Importing datasets¶
# Load the customers table (one row per order-level customer id:
# unique id, zip code prefix, city, state) and preview the first rows.
import pandas as pd
customers = pd.read_csv('olist_customers_dataset.csv')
customers.head()
| customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
|---|---|---|---|---|---|
| 0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | 14409 | franca | SP |
| 1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | 9790 | sao bernardo do campo | SP |
| 2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | 1151 | sao paulo | SP |
| 3 | b2b6027bc5c5109e529d4dc6358b12c3 | 259dac757896d24d7702b9acbbff3f3c | 8775 | mogi das cruzes | SP |
| 4 | 4f2d8ab171c80ec8364f7c12e35b23ad | 345ecd01c38d18a9036ed96c73b8d066 | 13056 | campinas | SP |
# Load the geolocation table (zip code prefix -> lat/lng, city, state);
# the same prefix can appear many times with slightly different coordinates.
geolocation = pd.read_csv('olist_geolocation_dataset.csv')
geolocation.head()
| geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
|---|---|---|---|---|---|
| 0 | 1037 | -23.545621 | -46.639292 | sao paulo | SP |
| 1 | 1046 | -23.546081 | -46.644820 | sao paulo | SP |
| 2 | 1046 | -23.546129 | -46.642951 | sao paulo | SP |
| 3 | 1041 | -23.544392 | -46.639499 | sao paulo | SP |
| 4 | 1035 | -23.541578 | -46.641607 | sao paulo | SP |
# Load the order-items table: one row per item within an order
# (product, seller, shipping deadline, price, freight value).
order_items = pd.read_csv('olist_order_items_dataset.csv')
order_items.head()
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.90 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-05-03 11:05:13 | 239.90 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-18 14:48:30 | 199.00 | 17.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 1 | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 2018-08-15 10:10:18 | 12.99 | 12.79 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 1 | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 2017-02-13 13:57:51 | 199.90 | 18.14 |
# Load the payments table: payment method, installments and value per order
# (orders may have multiple sequential payments).
order_payments = pd.read_csv('olist_order_payments_dataset.csv')
order_payments.head()
| order_id | payment_sequential | payment_type | payment_installments | payment_value | |
|---|---|---|---|---|---|
| 0 | b81ef226f3fe1789b1e8b2acac839d17 | 1 | credit_card | 8 | 99.33 |
| 1 | a9810da82917af2d9aefd1278f1dcfa0 | 1 | credit_card | 1 | 24.39 |
| 2 | 25e8ea4e93396b6fa0d3dd708e76c1bd | 1 | credit_card | 1 | 65.71 |
| 3 | ba78997921bbcdc1373bb41e913ab953 | 1 | credit_card | 8 | 107.78 |
| 4 | 42fdf880ba16b47b59251dd489d4441a | 1 | credit_card | 2 | 128.45 |
# Load the reviews table: score plus optional title/message per order
# (title/message are mostly missing, see the dataset-info section below).
reviews = pd.read_csv('olist_order_reviews_dataset.csv')
reviews.head()
| review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | |
|---|---|---|---|---|---|---|---|
| 0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | NaN | NaN | 2018-01-18 00:00:00 | 2018-01-18 21:46:59 |
| 1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | NaN | NaN | 2018-03-10 00:00:00 | 2018-03-11 03:05:13 |
| 2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | NaN | NaN | 2018-02-17 00:00:00 | 2018-02-18 14:36:24 |
| 3 | e64fb393e7b32834bb789ff8bb30750e | 658677c97b385a9be170737859d3511b | 5 | NaN | Recebi bem antes do prazo estipulado. | 2017-04-21 00:00:00 | 2017-04-21 22:02:06 |
| 4 | f7c4243c7fe1938f181bec41a392bdeb | 8e6bfb81e283fa7e4f11123a3fb894f1 | 5 | NaN | Parabéns lojas lannister adorei comprar pela I... | 2018-03-01 00:00:00 | 2018-03-02 10:26:53 |
# Load the core orders table: status plus the order-lifecycle timestamps
# (purchase, approval, carrier hand-off, customer delivery, estimated delivery).
orders = pd.read_csv('olist_orders_dataset.csv')
orders.head()
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 00:00:00 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 00:00:00 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 00:00:00 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 00:00:00 |
# Load the products table: category (Portuguese) and physical attributes
# (weight, dimensions, photo count).
products = pd.read_csv('olist_products_dataset.csv')
products.head()
| product_id | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1e9e8ef04dbcff4541ed26657ea517e5 | perfumaria | 40.0 | 287.0 | 1.0 | 225.0 | 16.0 | 10.0 | 14.0 |
| 1 | 3aa071139cb16b67ca9e5dea641aaa2f | artes | 44.0 | 276.0 | 1.0 | 1000.0 | 30.0 | 18.0 | 20.0 |
| 2 | 96bd76ec8810374ed1b65e291975717f | esporte_lazer | 46.0 | 250.0 | 1.0 | 154.0 | 18.0 | 9.0 | 15.0 |
| 3 | cef67bcfe19066a932b7673e239eb23d | bebes | 27.0 | 261.0 | 1.0 | 371.0 | 26.0 | 4.0 | 26.0 |
| 4 | 9dc1a7de274444849c219cff195d0b71 | utilidades_domesticas | 37.0 | 402.0 | 4.0 | 625.0 | 20.0 | 17.0 | 13.0 |
# Load the sellers table: seller id with zip code prefix, city and state.
sellers = pd.read_csv('olist_sellers_dataset.csv')
sellers.head()
| seller_id | seller_zip_code_prefix | seller_city | seller_state | |
|---|---|---|---|---|
| 0 | 3442f8959a84dea7ee197c632cb2df15 | 13023 | campinas | SP |
| 1 | d1b65fc7debc3361ea86b5f14c68d2e2 | 13844 | mogi guacu | SP |
| 2 | ce3ad9de960102d0677a81f5d0bb7b2d | 20031 | rio de janeiro | RJ |
| 3 | c0f3eea2e14555b6faeea3dd58c1b1c3 | 4195 | sao paulo | SP |
| 4 | 51a04a8a6bdcb23deccc82b0b80742cf | 12914 | braganca paulista | SP |
# Load the category translation table: Portuguese -> English category names.
product_names = pd.read_csv('product_category_name_translation.csv')
product_names.head()
| product_category_name | product_category_name_english | |
|---|---|---|
| 0 | beleza_saude | health_beauty |
| 1 | informatica_acessorios | computers_accessories |
| 2 | automotivo | auto |
| 3 | cama_mesa_banho | bed_bath_table |
| 4 | moveis_decoracao | furniture_decor |
# @title Setting style for charts
import warnings
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import logging
# Suppress all warnings (notebook-wide; hides pandas/matplotlib noise,
# but also any genuinely useful warnings — acceptable for exploratory work)
warnings.filterwarnings("ignore")
# Suppress matplotlib font-related and other warnings from logging
logging.getLogger('matplotlib').setLevel(logging.ERROR)
# Set "serif" as the global font family with specific styling parameters
mpl.rcParams.update({
"font.family": "serif",
"font.serif": ["Liberation Serif"],  # concrete serif face to use (NOTE(review): earlier comment mentioned Times New Roman, which is not configured here)
"font.weight": "normal",
"axes.titlesize": 16,
"axes.labelsize": 12,
"xtick.labelsize": 10,
"ytick.labelsize": 10,
})
# Set a Viridis color palette (10 evenly spaced colors across the colormap)
viridis_colors = plt.cm.viridis(np.linspace(0, 1, 10))
Inspecting datasets, columns, unique and missing values¶
# @title Dataset Info
# Map a readable name to each loaded DataFrame so we can inspect them in one pass.
datasets = {
    'customers': customers,
    'geolocation': geolocation,
    'order_items': order_items,
    'order_payments': order_payments,
    'reviews': reviews,
    'orders': orders,
    'products': products,
    'sellers': sellers,
    'product_names': product_names,
}
# For every dataset, print its shape plus a per-column summary:
# dtype, number of distinct values, and number of missing values.
for name, frame in datasets.items():
    print('Dataset name: ', name)
    print(f'Shape of dataset: {frame.shape}')
    summary = pd.DataFrame([
        {
            'Column name': col,
            'Data type': frame[col].dtype,
            'Unique values': frame[col].nunique(),
            'Missing values': frame[col].isnull().sum(),
        }
        for col in frame.columns
    ])
    print(summary)
    print('------------------------------------------------------------------------')
Dataset name: customers
Shape of dataset: (99441, 5)
Column name Data type Unique values Missing values
0 customer_id object 99441 0
1 customer_unique_id object 96096 0
2 customer_zip_code_prefix int64 14994 0
3 customer_city object 4119 0
4 customer_state object 27 0
------------------------------------------------------------------------
Dataset name: geolocation
Shape of dataset: (1000163, 5)
Column name Data type Unique values Missing values
0 geolocation_zip_code_prefix int64 19015 0
1 geolocation_lat float64 717360 0
2 geolocation_lng float64 717613 0
3 geolocation_city object 8011 0
4 geolocation_state object 27 0
------------------------------------------------------------------------
Dataset name: order_items
Shape of dataset: (112650, 7)
Column name Data type Unique values Missing values
0 order_id object 98666 0
1 order_item_id int64 21 0
2 product_id object 32951 0
3 seller_id object 3095 0
4 shipping_limit_date object 93318 0
5 price float64 5968 0
6 freight_value float64 6999 0
------------------------------------------------------------------------
Dataset name: order_payments
Shape of dataset: (103886, 5)
Column name Data type Unique values Missing values
0 order_id object 99440 0
1 payment_sequential int64 29 0
2 payment_type object 5 0
3 payment_installments int64 24 0
4 payment_value float64 29077 0
------------------------------------------------------------------------
Dataset name: reviews
Shape of dataset: (99224, 7)
Column name Data type Unique values Missing values
0 review_id object 98410 0
1 order_id object 98673 0
2 review_score int64 5 0
3 review_comment_title object 4527 87656
4 review_comment_message object 36159 58247
5 review_creation_date object 636 0
6 review_answer_timestamp object 98248 0
------------------------------------------------------------------------
Dataset name: orders
Shape of dataset: (99441, 8)
Column name Data type Unique values Missing values
0 order_id object 99441 0
1 customer_id object 99441 0
2 order_status object 8 0
3 order_purchase_timestamp object 98875 0
4 order_approved_at object 90733 160
5 order_delivered_carrier_date object 81018 1783
6 order_delivered_customer_date object 95664 2965
7 order_estimated_delivery_date object 459 0
------------------------------------------------------------------------
Dataset name: products
Shape of dataset: (32951, 9)
Column name Data type Unique values Missing values
0 product_id object 32951 0
1 product_category_name object 73 610
2 product_name_lenght float64 66 610
3 product_description_lenght float64 2960 610
4 product_photos_qty float64 19 610
5 product_weight_g float64 2204 2
6 product_length_cm float64 99 2
7 product_height_cm float64 102 2
8 product_width_cm float64 95 2
------------------------------------------------------------------------
Dataset name: sellers
Shape of dataset: (3095, 4)
Column name Data type Unique values Missing values
0 seller_id object 3095 0
1 seller_zip_code_prefix int64 2246 0
2 seller_city object 611 0
3 seller_state object 23 0
------------------------------------------------------------------------
Dataset name: product_names
Shape of dataset: (71, 2)
Column name Data type Unique values Missing values
0 product_category_name object 71 0
1 product_category_name_english object 71 0
------------------------------------------------------------------------
First glimpse: Visualizing the data¶
# @title Sellers by Count of Orders Filled
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Number of order-item rows attributed to each seller.
orders_per_seller = order_items['seller_id'].value_counts()

# Bucket sellers by order volume; pd.cut bins are right-inclusive,
# so e.g. label '10-15' covers counts 11..15.
bin_edges = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 75, 100, 500, 1000, float('inf')]
bin_labels = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '10-15', '15-20', '20-30', '30-40', '40-50', '50-75', '75-100', '100-500', '500-1000', '>1000']
bucketed = pd.cut(orders_per_seller, bins=bin_edges, labels=bin_labels)
sellers_per_bucket = bucketed.value_counts().sort_index()

# Global font and style settings for the chart.
mpl.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# Bar chart: how many sellers fall into each order-count bucket.
plt.figure(figsize=(10, 6))
sellers_per_bucket.plot(kind='bar', color=plt.cm.viridis(0.6))  # Viridis color
plt.title('Number of Sellers Based on the Count of Orders Filled', fontsize=16)
plt.xlabel('Number of Orders', fontsize=12)
plt.ylabel('Number of Sellers', fontsize=12)

# Hide all spines except the bottom one for a cleaner look.
ax = plt.gca()
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

# Angle the x labels so bucket names stay readable.
plt.xticks(rotation=45, ha='right')
plt.show()
# @title Total Orders per Month
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

# Parse every order-lifecycle timestamp column into datetimes.
timestamp_cols = ['order_purchase_timestamp', 'order_approved_at',
                  'order_delivered_carrier_date', 'order_delivered_customer_date',
                  'order_estimated_delivery_date']
for column in timestamp_cols:
    orders[column] = pd.to_datetime(orders[column])

# Tag each order with its purchase year-month and drop the (partial) 2016 data.
orders['year_month'] = orders['order_purchase_timestamp'].dt.to_period('M')
orders = orders.loc[orders['order_purchase_timestamp'].dt.year > 2016]

# Orders per calendar month.
grouped = orders.groupby('year_month').size().reset_index(name='order_count')

# Global font and style settings for the chart.
mpl.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# Monthly order counts as a bar chart.
plt.figure(figsize=(8, 4))
sns.barplot(data=grouped, x='year_month', y='order_count', color=plt.cm.viridis(0.6))
plt.title('Number of Orders per Month', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Orders', fontsize=12)

# Angle the month labels so they remain readable.
plt.xticks(rotation=45, ha='right')

# Hide all spines except the bottom one.
ax = plt.gca()
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
plt.show()
# @title Distribution of Total Delivery Times
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

# Silence chained-assignment warnings for the .loc writes below.
pd.options.mode.chained_assignment = None

# Total delivery time = purchase timestamp -> customer hand-off,
# first as a timedelta, then expressed in fractional days.
orders.loc[:, 'delivery_time'] = (
    orders['order_delivered_customer_date'] - orders['order_purchase_timestamp']
)
orders.loc[:, 'delivery_time_days'] = (
    orders['delivery_time'].dt.total_seconds() / (24 * 3600)
)

# Global font and style settings for the chart.
mpl.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# Histogram with KDE overlay of delivery times in days.
plt.figure(figsize=(8, 4))
sns.histplot(data=orders, x='delivery_time_days', kde=True, color=plt.cm.viridis(0.5))
plt.title('Distribution of Delivery Time', fontsize=16)
plt.xlabel('Delivery Time (days)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

# Hide all spines except the bottom one.
ax = plt.gca()
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
plt.show()
Calculating delivery times¶
import pandas as pd

# Work on a copy so the derived columns don't alias the original frame.
orders = orders.copy()

# Each delivery-performance metric is the elapsed time between two
# order-lifecycle timestamps (end minus start).  Driving the four
# computations from one mapping removes the copy-paste repetition and
# guarantees each '<metric>_days' column matches its timedelta column.
delivery_metrics = {
    'order_processing_time': ('order_approved_at', 'order_purchase_timestamp'),
    'carrier_delivery_time': ('order_delivered_carrier_date', 'order_approved_at'),
    'customer_delivery_time': ('order_delivered_customer_date', 'order_delivered_carrier_date'),
    # Negative values mean delivery arrived BEFORE the estimated date.
    'time_to_estimate_delivery': ('order_delivered_customer_date', 'order_estimated_delivery_date'),
}
for metric, (end_col, start_col) in delivery_metrics.items():
    orders[metric] = orders[end_col] - orders[start_col]
    # Express the timedelta in fractional days for easier interpretation/plotting.
    orders[f'{metric}_days'] = orders[metric].dt.total_seconds() / (24 * 3600)

# Preview the derived metrics.
orders[['order_id',
        'order_processing_time', 'carrier_delivery_time', 'customer_delivery_time',
        'time_to_estimate_delivery', 'delivery_time',
        'order_processing_time_days', 'carrier_delivery_time_days',
        'customer_delivery_time_days', 'time_to_estimate_delivery_days',
        'delivery_time_days']].head()
| order_id | order_processing_time | carrier_delivery_time | customer_delivery_time | time_to_estimate_delivery | delivery_time | order_processing_time_days | carrier_delivery_time_days | customer_delivery_time_days | time_to_estimate_delivery_days | delivery_time_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 0 days 00:10:42 | 2 days 08:47:45 | 6 days 01:30:13 | -8 days +21:25:13 | 8 days 10:28:40 | 0.007431 | 2.366493 | 6.062650 | -7.107488 | 8.436574 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | 1 days 06:42:50 | 0 days 11:06:33 | 12 days 00:56:45 | -6 days +15:27:45 | 13 days 18:46:08 | 1.279745 | 0.462882 | 12.039410 | -5.355729 | 13.782037 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 0 days 00:16:34 | 0 days 04:54:37 | 9 days 04:16:29 | -18 days +18:06:29 | 9 days 09:27:40 | 0.011505 | 0.204595 | 9.178113 | -17.245498 | 9.394213 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | 0 days 00:17:53 | 3 days 17:54:00 | 9 days 10:48:43 | -13 days +00:28:42 | 13 days 05:00:36 | 0.012419 | 3.745833 | 9.450498 | -12.980069 | 13.208750 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 0 days 01:01:50 | 0 days 21:26:05 | 1 days 22:30:28 | -10 days +18:17:02 | 2 days 20:58:23 | 0.042940 | 0.893113 | 1.937824 | -9.238171 | 2.873877 |
# PREPROCESSING TIME
# processing time is a small fraction of total delivery time in most cases;
# outliers exist where it reaches roughly 60% of the total
# CARRIER TAKEOVER TIME
# the span from processing completion to carrier pickup is typically ~18%-36%
# of the total, again with outliers above 60%
# CARRIER DELIVERY TIME (TRANSIT TIME)
# transit is the largest component — on average roughly 60%-80% of the total —
# with outliers below 30%
# delivery time distribution is plausibly driven by:
# 1) customer profile - city, zip code (geolocation)
# 2) seller profile - city, zip code (geolocation)
# ---> use the geolocation zip code prefix to match latitude and longitude for each customer and seller
# other derived attributes worth exploring:
# for customers:
# 1) the number of orders filled
# 2) the average order size
# 3) the payment type (e.g. credit card)
# 4) the number of payment installments
# 5) products purchased from various sellers (by chance or a particular choice?) (i.e., the number of sellers purchased from)
# 6) customer relationship duration (how long a customer has been ordering from a particular seller)
# for sellers:
# 1) the number of customers/ orders processed
# 2) the length of operation (are there new sellers entering the market or old sellers exiting?)
# 3) customer profiling (a high value customer?)

# share of each delivery stage in the order's total delivery time
# (values are fractions of 1, not percentages)
share_sources = {
    'processing_share': 'order_processing_time_days',
    'carrier_share': 'carrier_delivery_time_days',
    'customer_share': 'customer_delivery_time_days',
}
for share_col, days_col in share_sources.items():
    orders[share_col] = orders[days_col] / orders['delivery_time_days']
orders[['order_id', 'processing_share', 'carrier_share', 'customer_share']].head()
| order_id | processing_share | carrier_share | customer_share | |
|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 0.000881 | 0.280504 | 0.718615 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | 0.092856 | 0.033586 | 0.873558 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 0.001225 | 0.021779 | 0.976997 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | 0.000940 | 0.283587 | 0.715473 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 0.014941 | 0.310770 | 0.674289 |
# look at the contribution of each delivery stage, aggregated per seller
# attach seller_id (from order_items) to every order row
orders_merged = pd.merge(orders, order_items[['order_id', 'seller_id']],
                         on='order_id', how='left')

# per-seller mean share of each delivery stage
share_cols = ['processing_share', 'carrier_share', 'customer_share']
grouped = orders_merged.groupby('seller_id')[share_cols].mean()

# long format for seaborn plotting
# (NOTE(review): values are fractions 0-1, despite the '%' in the axis label)
melted_orders = grouped.melt(value_vars=share_cols,
                             var_name='Delivery Metric',
                             value_name='Share (%) in total delivery time')

# box plots of the per-seller stage shares
sns.catplot(data=melted_orders, x='Delivery Metric',
            y='Share (%) in total delivery time', kind='box', height=6, aspect=2)
plt.title('Box Plots of Delivery Times')
plt.xticks(rotation=45)  # rotate x labels for better visibility
plt.show()

# per-seller mean raw delivery times (days)
cols_to_plot = ['order_processing_time_days', 'carrier_delivery_time_days',
                'customer_delivery_time_days', 'time_to_estimate_delivery_days',
                'delivery_time_days']
grouped = orders_merged.groupby('seller_id')[cols_to_plot].mean().reset_index()
grouped

# pairwise scatter plots of the delivery-time metrics, KDE on the diagonal
sns.pairplot(grouped[cols_to_plot], diag_kind='kde')
plt.show()
Data cleaning¶
Removing negative delivery times
# we observe that there are order processing times up to one month (which could be outliers)
# more importantly, there are negative values of carrier delivery time (therefore also in customer and total delivery times) - these errors could be removed if constitute a small part of dataset
# estimated delivery time is skewed towards faster than estimated deliveries
# count the number of records that have either negative processing, carrier delivery, customer delivery or total delivery times
print('Length of dataframe before: ', len(orders))
negative_processing = (orders['order_processing_time_days'] < 0).sum()
negative_carrier = (orders['carrier_delivery_time_days'] < 0).sum()
negative_customer = (orders['customer_delivery_time_days'] < 0).sum()
negative_total = (orders['delivery_time_days'] < 0).sum()
print('Number of negative processing times: ', negative_processing)
print('Number of negative carrier delivery times: ', negative_carrier)
print('Number of negative customer delivery times: ', negative_customer)
print('Number of negative total delivery times: ', negative_total)
# drop every row failing any of the four checks in a single pass
# (one combined mask instead of four sequential copies of the frame;
# NaN comparisons are False, so rows with missing times are dropped
# exactly as they were by the sequential `>= 0` filters)
valid = (
    (orders['order_processing_time_days'] >= 0)
    & (orders['carrier_delivery_time_days'] >= 0)
    & (orders['customer_delivery_time_days'] >= 0)
    & (orders['delivery_time_days'] >= 0)
)
orders = orders[valid]
print('Length of dataframe after: ', len(orders))
Length of dataframe before: 99112 Number of negative processing times: 0 Number of negative carrier delivery times: 1359 Number of negative customer delivery times: 19 Number of negative total delivery times: 0 Length of dataframe after: 94820
Merging datasets¶
# sanity check: row/column count of the cleaned orders frame
print('Dataset shape: ', orders.shape)
Dataset shape: (94820, 22)
Adding customer geolocation data to customers table¶
# to the orders dataset we still need to merge:
# 1) customers geolocation
# 2) sellers geolocation
# geolocation holds over 1M rows and a zip prefix maps to several lat/lng
# pairs (different streets, buildings, etc.), none of which matches a
# customer/seller exactly — so keep only the first occurrence per prefix
geolocation_unique = geolocation.drop_duplicates(subset='geolocation_zip_code_prefix')

# attach the (first-occurrence) coordinates to each customer by zip prefix,
# then prefix the coordinate columns with 'c_' to mark them as customer-side
customers_geo = (
    customers
    .merge(geolocation_unique[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']],
           left_on='customer_zip_code_prefix',
           right_on='geolocation_zip_code_prefix',
           how='left')
    .rename(columns={'geolocation_lat': 'c_geolocation_lat',
                     'geolocation_lng': 'c_geolocation_lng'})
)
Adding sellers geolocation data to sellers table¶
# attach the (first-occurrence) coordinates to each seller by zip prefix,
# then prefix the coordinate columns with 's_' to mark them as seller-side
sellers_geo = (
    sellers
    .merge(geolocation_unique[['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng']],
           left_on='seller_zip_code_prefix',
           right_on='geolocation_zip_code_prefix',
           how='left')
    .rename(columns={'geolocation_lat': 's_geolocation_lat',
                     'geolocation_lng': 's_geolocation_lng'})
)
Adding order item info (e.g. product id, seller id, price, freight value) to orders table¶
# enrich orders with item-level fields (product, seller, price, freight);
# result has one row per (order, item) pair
orders_merge_items = orders.merge(
    order_items[['order_id', 'product_id', 'seller_id', 'price', 'freight_value']],
    on='order_id',
    how='left',
)
print('Length of the dataframes: ')
print('orders: ', len(orders))
print('orders_merge_items: ', len(orders_merge_items))
Length of the dataframes: orders: 94820 orders_merge_items: 108270
# orders_merge_items has extra rows because an order can contain several items
# distribution of orders by the number of items they contain
num_items = (
    orders_merge_items['order_id']
    .value_counts()
    .reset_index()
    .rename(columns={'count': 'num_items_in_order'})
)
num_orders = (
    num_items
    .groupby('num_items_in_order')['order_id']
    .count()
    .reset_index()
    .rename(columns={'order_id': 'num_orders'})
)
# cumulative share of orders covered up to each item count
total_orders = num_orders['num_orders'].sum()
num_orders['cumulative_count'] = num_orders['num_orders'].cumsum()
num_orders['cumulative_percentage'] = (num_orders['cumulative_count'] / total_orders * 100).round(2)
num_orders
| num_items_in_order | num_orders | cumulative_count | cumulative_percentage | |
|---|---|---|---|---|
| 0 | 1 | 85373 | 85373 | 90.04 |
| 1 | 2 | 7242 | 92615 | 97.67 |
| 2 | 3 | 1289 | 93904 | 99.03 |
| 3 | 4 | 487 | 94391 | 99.55 |
| 4 | 5 | 188 | 94579 | 99.75 |
| 5 | 6 | 184 | 94763 | 99.94 |
| 6 | 7 | 21 | 94784 | 99.96 |
| 7 | 8 | 8 | 94792 | 99.97 |
| 8 | 9 | 3 | 94795 | 99.97 |
| 9 | 10 | 8 | 94803 | 99.98 |
| 10 | 11 | 4 | 94807 | 99.99 |
| 11 | 12 | 5 | 94812 | 99.99 |
| 12 | 13 | 1 | 94813 | 99.99 |
| 13 | 14 | 2 | 94815 | 99.99 |
| 14 | 15 | 2 | 94817 | 100.00 |
| 15 | 20 | 2 | 94819 | 100.00 |
| 16 | 21 | 1 | 94820 | 100.00 |
# ~90% of orders contain a single item and ~99% contain three items or fewer
# now look at how many distinct sellers fill each order
num_sellers = (
    orders_merge_items
    .groupby('order_id')['seller_id']
    .nunique()
    .reset_index()
    .rename(columns={'seller_id': 'num_sellers_in_order'})
)
num_orders = (
    num_sellers
    .groupby('num_sellers_in_order')
    .count()
    .reset_index()
    .rename(columns={'order_id': 'num_orders'})
)
# cumulative share of orders covered up to each seller count
total_orders = num_orders['num_orders'].sum()
num_orders['cumulative_count'] = num_orders['num_orders'].cumsum()
num_orders['cumulative_percentage'] = (num_orders['cumulative_count'] / total_orders * 100).round(2)
num_orders
| num_sellers_in_order | num_orders | cumulative_count | cumulative_percentage | |
|---|---|---|---|---|
| 0 | 1 | 93600 | 93600 | 98.71 |
| 1 | 2 | 1164 | 94764 | 99.94 |
| 2 | 3 | 53 | 94817 | 100.00 |
| 3 | 4 | 2 | 94819 | 100.00 |
| 4 | 5 | 1 | 94820 | 100.00 |
# we can observe that almost 99% of orders are filled by a single seller (including multiple item orders)
Adding customer geolocation info to core orders table¶
# bring the customer coordinates onto the order-item rows
orders_merge_items_geoc = orders_merge_items.merge(
    customers_geo[['customer_id', 'c_geolocation_lat', 'c_geolocation_lng']],
    on='customer_id',
    how='left',
)
print('Dataset shape: ', orders_merge_items_geoc.shape)
Dataset shape: (108270, 28)
Adding sellers geolocation info to core orders table¶
# bring the seller coordinates onto the order-item rows
orders_merge_items_geo = orders_merge_items_geoc.merge(
    sellers_geo[['seller_id', 's_geolocation_lat', 's_geolocation_lng']],
    on='seller_id',
    how='left',
)
Remove any missing values for coordinates (for plotting)
# check for any missing geolocation info
# (per-column NaN counts; coordinates are NaN where the zip prefix
# had no match in the geolocation table)
print(customers_geo.isna().sum())
customer_id 0 customer_unique_id 0 customer_zip_code_prefix 0 customer_city 0 customer_state 0 geolocation_zip_code_prefix 278 c_geolocation_lat 278 c_geolocation_lng 278 dtype: int64
# both the customers and sellers frames contain NaN coordinates —
# likely zip prefixes that are absent from the geolocation dataset
missing_mask = customers_geo['c_geolocation_lat'].isna()
nan_zip_prefixes = customers_geo.loc[missing_mask, 'customer_zip_code_prefix'].unique()
print('Missing geolocation info for these zip codes present in customers dataset: ', nan_zip_prefixes)
print('Number of zip codes missing: ', len(nan_zip_prefixes))
Missing geolocation info for these zip codes present in customers dataset: [72300 11547 64605 72465 7729 72904 35408 78554 73369 8980 29949 65137 28655 73255 28388 6930 71676 64047 61906 83210 71919 36956 35242 72005 29718 41347 70324 70686 72341 12332 70716 71905 75784 73082 71884 71574 72238 71996 76968 71975 72595 72017 72596 67105 25840 72002 72821 85118 25919 95853 72583 68511 70701 71591 72535 95572 73090 72242 86135 70316 73091 41098 58734 73310 71810 72280 7430 73081 70333 72268 35104 72455 72237 17390 76897 84623 70702 72760 73088 29196 36596 57254 71995 73093 75257 48504 83843 62625 37005 73401 49870 13307 28617 73402 56327 71976 72587 85958 19740 77404 44135 28120 72863 87323 87511 72440 72243 65830 71261 28575 2140 71551 72023 28160 55027 43870 94370 38710 42716 36248 71593 71953 72549 72457 56485 71590 93602 7412 8342 39103 72536 59547 59299 85894 36857 71993 7784 71539 42843 86996 73272 72867 27980 58286 71208 71971 73391 64095 72427 38627 62898 71698 12770 72338 68629 28530 55863] Number of zip codes missing: 157
# remove rows with NaN in latitude or longitude columns
# (drops orders whose customer or seller zip prefix had no geolocation match)
orders_merge_items_geo_clean = orders_merge_items_geo.dropna(subset=['c_geolocation_lat', 'c_geolocation_lng', 's_geolocation_lat', 's_geolocation_lng'])
Aggregate order items (number and type of products, the price and freight value) to a core orders table
---> the unit of analysis is a particular order a customer places
# orders_merge_items_geo_clean holds one row per (order, item);
# collapse it back to one row per order.
# Aggregation policy:
#   * price / freight_value       -> sum across the order's items
#   * other numeric/timedelta     -> mean (these values repeat within an order)
#   * seller_id / product_id      -> most frequent value (mode)
#   * remaining object columns    -> first value in the group (e.g. customer_id,
#                                    order_status — one per order anyway)
#   * datetime / period columns   -> first value in the group
#     NOTE(review): 'first' is positional (first row of the group),
#     not chronological
agg_funcs = {}

numeric_cols = orders_merge_items_geo_clean.select_dtypes(
    include=['float64', 'int64', 'timedelta64[ns]']).columns
for col in numeric_cols:
    agg_funcs[col] = 'sum' if ('price' in col or 'freight_value' in col) else 'mean'


def _most_frequent(values):
    """Return the most frequent value (ties broken by sorted order, as pd mode does)."""
    return values.mode()[0]


for col in orders_merge_items_geo_clean.select_dtypes(include=['object']).columns:
    if col == 'order_id':
        continue  # groupby key — nothing to aggregate
    agg_funcs[col] = _most_frequent if col in ('seller_id', 'product_id') else 'first'

for col in orders_merge_items_geo_clean.select_dtypes(
        include=['datetime64[ns]', 'period[M]']).columns:
    agg_funcs[col] = 'first'

# Collapse to one row per order.
orders_merged = orders_merge_items_geo_clean.groupby('order_id').agg(agg_funcs).reset_index()

# Preview the aggregated result.
orders_merged.head()
| order_id | delivery_time | delivery_time_days | order_processing_time | carrier_delivery_time | customer_delivery_time | time_to_estimate_delivery | order_processing_time_days | carrier_delivery_time_days | customer_delivery_time_days | ... | customer_id | order_status | product_id | seller_id | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | year_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 7 days 14:44:46 | 7.614421 | 0 days 00:46:33 | 6 days 08:48:41 | 1 days 05:09:32 | -9 days +23:43:48 | 0.032326 | 6.367141 | 1.214954 | ... | 3ce436f183e68e07877b285a838db11a | delivered | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-13 08:59:02 | 2017-09-13 09:45:35 | 2017-09-19 18:34:16 | 2017-09-20 23:43:48 | 2017-09-29 | 2017-09 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 16 days 05:11:18 | 16.216181 | 0 days 00:12:07 | 8 days 03:29:47 | 8 days 01:29:24 | -3 days +16:04:24 | 0.008414 | 8.145683 | 8.062083 | ... | f6dd3ec061db4e3987629fe6b26e5cce | delivered | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-04-26 10:53:06 | 2017-04-26 11:05:13 | 2017-05-04 14:35:00 | 2017-05-12 16:04:24 | 2017-05-15 | 2017-04 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 7 days 22:45:45 | 7.948437 | 0 days 00:14:59 | 1 days 21:48:18 | 6 days 00:42:28 | -14 days +13:19:16 | 0.010405 | 1.908542 | 6.029491 | ... | 6489ae5e4333f3693df5ad4372dab6d3 | delivered | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-14 14:33:31 | 2018-01-14 14:48:30 | 2018-01-16 12:36:48 | 2018-01-22 13:19:16 | 2018-02-05 | 2018-01 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 6 days 03:32:04 | 6.147269 | 0 days 00:09:43 | 2 days 03:17:42 | 4 days 00:04:39 | -6 days +13:32:39 | 0.006748 | 2.137292 | 4.003229 | ... | d4eb9395c8c0431ee92fce09860c5a06 | delivered | 7634da152a4610f1595efa32f14722fc | 9d7a1d34a5052409006425275ba1c2b4 | 2018-08-08 10:00:35 | 2018-08-08 10:10:18 | 2018-08-10 13:28:00 | 2018-08-14 13:32:39 | 2018-08-20 | 2018-08 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 25 days 02:44:40 | 25.114352 | 0 days 00:12:22 | 11 days 19:35:56 | 13 days 06:56:22 | -16 days +16:42:31 | 0.008588 | 11.816620 | 13.289144 | ... | 58dbd0b2d70206bf40e62cd34e84d795 | delivered | ac6c3623068f30de03045865e4e10089 | df560393f3a51e74553ab94004ba5c87 | 2017-02-04 13:57:51 | 2017-02-04 14:10:13 | 2017-02-16 09:46:09 | 2017-03-01 16:42:31 | 2017-03-17 | 2017-02 |
5 rows × 30 columns
Adding relevant payments info (e.g. payment type, number of installments, payment value) to the core orders table¶
# Distribution of payment_sequential: how many separate payment records an
# order has (most orders are paid in a single transaction).
order_payments['payment_sequential'].value_counts().reset_index()
| payment_sequential | count | |
|---|---|---|
| 0 | 1 | 99360 |
| 1 | 2 | 3039 |
| 2 | 3 | 581 |
| 3 | 4 | 278 |
| 4 | 5 | 170 |
| 5 | 6 | 118 |
| 6 | 7 | 82 |
| 7 | 8 | 54 |
| 8 | 9 | 43 |
| 9 | 10 | 34 |
| 10 | 11 | 29 |
| 11 | 12 | 21 |
| 12 | 13 | 13 |
| 13 | 14 | 10 |
| 14 | 15 | 8 |
| 15 | 18 | 6 |
| 16 | 19 | 6 |
| 17 | 16 | 6 |
| 18 | 17 | 6 |
| 19 | 21 | 4 |
| 20 | 20 | 4 |
| 21 | 22 | 3 |
| 22 | 26 | 2 |
| 23 | 24 | 2 |
| 24 | 23 | 2 |
| 25 | 25 | 2 |
| 26 | 29 | 1 |
| 27 | 28 | 1 |
| 28 | 27 | 1 |
# Collapse payments to one row per order: total amount paid, the dominant
# payment type, and the maximum installment count observed for the order.
order_payments_aggregated = order_payments.groupby('order_id').agg(
    total_payment_value=pd.NamedAgg('payment_value', 'sum'),
    payment_type=pd.NamedAgg('payment_type', lambda s: s.mode()[0]),
    payment_installments=pd.NamedAgg('payment_installments', 'max'),
).reset_index()
order_payments_aggregated.head()
| order_id | total_payment_value | payment_type | payment_installments | |
|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 72.19 | credit_card | 2 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 259.83 | credit_card | 3 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 216.87 | credit_card | 5 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 25.78 | credit_card | 2 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 218.04 | credit_card | 3 |
# Attach the aggregated payment info to the per-order table.
payment_cols = ['order_id', 'payment_type', 'payment_installments', 'total_payment_value']
orders_merged = orders_merged.merge(order_payments_aggregated[payment_cols], on='order_id', how='left')
orders_merged.head()
| order_id | delivery_time | delivery_time_days | order_processing_time | carrier_delivery_time | customer_delivery_time | time_to_estimate_delivery | order_processing_time_days | carrier_delivery_time_days | customer_delivery_time_days | ... | seller_id | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | year_month | payment_type | payment_installments | total_payment_value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 7 days 14:44:46 | 7.614421 | 0 days 00:46:33 | 6 days 08:48:41 | 1 days 05:09:32 | -9 days +23:43:48 | 0.032326 | 6.367141 | 1.214954 | ... | 48436dade18ac8b2bce089ec2a041202 | 2017-09-13 08:59:02 | 2017-09-13 09:45:35 | 2017-09-19 18:34:16 | 2017-09-20 23:43:48 | 2017-09-29 | 2017-09 | credit_card | 2 | 72.19 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 16 days 05:11:18 | 16.216181 | 0 days 00:12:07 | 8 days 03:29:47 | 8 days 01:29:24 | -3 days +16:04:24 | 0.008414 | 8.145683 | 8.062083 | ... | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-04-26 10:53:06 | 2017-04-26 11:05:13 | 2017-05-04 14:35:00 | 2017-05-12 16:04:24 | 2017-05-15 | 2017-04 | credit_card | 3 | 259.83 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 7 days 22:45:45 | 7.948437 | 0 days 00:14:59 | 1 days 21:48:18 | 6 days 00:42:28 | -14 days +13:19:16 | 0.010405 | 1.908542 | 6.029491 | ... | 5b51032eddd242adc84c38acab88f23d | 2018-01-14 14:33:31 | 2018-01-14 14:48:30 | 2018-01-16 12:36:48 | 2018-01-22 13:19:16 | 2018-02-05 | 2018-01 | credit_card | 5 | 216.87 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 6 days 03:32:04 | 6.147269 | 0 days 00:09:43 | 2 days 03:17:42 | 4 days 00:04:39 | -6 days +13:32:39 | 0.006748 | 2.137292 | 4.003229 | ... | 9d7a1d34a5052409006425275ba1c2b4 | 2018-08-08 10:00:35 | 2018-08-08 10:10:18 | 2018-08-10 13:28:00 | 2018-08-14 13:32:39 | 2018-08-20 | 2018-08 | credit_card | 2 | 25.78 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 25 days 02:44:40 | 25.114352 | 0 days 00:12:22 | 11 days 19:35:56 | 13 days 06:56:22 | -16 days +16:42:31 | 0.008588 | 11.816620 | 13.289144 | ... | df560393f3a51e74553ab94004ba5c87 | 2017-02-04 13:57:51 | 2017-02-04 14:10:13 | 2017-02-16 09:46:09 | 2017-03-01 16:42:31 | 2017-03-17 | 2017-02 | credit_card | 3 | 218.04 |
5 rows × 33 columns
Adding relevant product info to the core orders table¶
# Bring in product attributes and derive the package volume (cm^3).
product_cols = ['product_id', 'product_category_name', 'product_weight_g',
                'product_length_cm', 'product_height_cm', 'product_width_cm']
orders_merged = orders_merged.merge(products[product_cols], on='product_id', how='left')
orders_merged['product_vol'] = (
    orders_merged['product_length_cm']
    * orders_merged['product_height_cm']
    * orders_merged['product_width_cm']
)
orders_merged.head()
| order_id | delivery_time | delivery_time_days | order_processing_time | carrier_delivery_time | customer_delivery_time | time_to_estimate_delivery | order_processing_time_days | carrier_delivery_time_days | customer_delivery_time_days | ... | year_month | payment_type | payment_installments | total_payment_value | product_category_name | product_weight_g | product_length_cm | product_height_cm | product_width_cm | product_vol | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 7 days 14:44:46 | 7.614421 | 0 days 00:46:33 | 6 days 08:48:41 | 1 days 05:09:32 | -9 days +23:43:48 | 0.032326 | 6.367141 | 1.214954 | ... | 2017-09 | credit_card | 2 | 72.19 | cool_stuff | 650.0 | 28.0 | 9.0 | 14.0 | 3528.0 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 16 days 05:11:18 | 16.216181 | 0 days 00:12:07 | 8 days 03:29:47 | 8 days 01:29:24 | -3 days +16:04:24 | 0.008414 | 8.145683 | 8.062083 | ... | 2017-04 | credit_card | 3 | 259.83 | pet_shop | 30000.0 | 50.0 | 30.0 | 40.0 | 60000.0 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 7 days 22:45:45 | 7.948437 | 0 days 00:14:59 | 1 days 21:48:18 | 6 days 00:42:28 | -14 days +13:19:16 | 0.010405 | 1.908542 | 6.029491 | ... | 2018-01 | credit_card | 5 | 216.87 | moveis_decoracao | 3050.0 | 33.0 | 13.0 | 33.0 | 14157.0 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 6 days 03:32:04 | 6.147269 | 0 days 00:09:43 | 2 days 03:17:42 | 4 days 00:04:39 | -6 days +13:32:39 | 0.006748 | 2.137292 | 4.003229 | ... | 2018-08 | credit_card | 2 | 25.78 | perfumaria | 200.0 | 16.0 | 10.0 | 15.0 | 2400.0 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 25 days 02:44:40 | 25.114352 | 0 days 00:12:22 | 11 days 19:35:56 | 13 days 06:56:22 | -16 days +16:42:31 | 0.008588 | 11.816620 | 13.289144 | ... | 2017-02 | credit_card | 3 | 218.04 | ferramentas_jardim | 3750.0 | 35.0 | 40.0 | 30.0 | 42000.0 |
5 rows × 39 columns
Adding review score to core orders table¶
# Add the review score to test whether reviews correlate with delivery
# times / delivery performance metrics.
# BUGFIX: an order can have more than one review in the reviews dataset, so
# merging the raw table would duplicate order rows. Average the score per
# order first (single-review orders are unaffected; the column stays float,
# as before).
review_scores = reviews.groupby('order_id', as_index=False)['review_score'].mean()
orders_merged = pd.merge(orders_merged, review_scores, on='order_id', how='left')
orders_merged.head()
| order_id | delivery_time | delivery_time_days | order_processing_time | carrier_delivery_time | customer_delivery_time | time_to_estimate_delivery | order_processing_time_days | carrier_delivery_time_days | customer_delivery_time_days | ... | payment_type | payment_installments | total_payment_value | product_category_name | product_weight_g | product_length_cm | product_height_cm | product_width_cm | product_vol | review_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 7 days 14:44:46 | 7.614421 | 0 days 00:46:33 | 6 days 08:48:41 | 1 days 05:09:32 | -9 days +23:43:48 | 0.032326 | 6.367141 | 1.214954 | ... | credit_card | 2 | 72.19 | cool_stuff | 650.0 | 28.0 | 9.0 | 14.0 | 3528.0 | 5.0 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 16 days 05:11:18 | 16.216181 | 0 days 00:12:07 | 8 days 03:29:47 | 8 days 01:29:24 | -3 days +16:04:24 | 0.008414 | 8.145683 | 8.062083 | ... | credit_card | 3 | 259.83 | pet_shop | 30000.0 | 50.0 | 30.0 | 40.0 | 60000.0 | 4.0 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 7 days 22:45:45 | 7.948437 | 0 days 00:14:59 | 1 days 21:48:18 | 6 days 00:42:28 | -14 days +13:19:16 | 0.010405 | 1.908542 | 6.029491 | ... | credit_card | 5 | 216.87 | moveis_decoracao | 3050.0 | 33.0 | 13.0 | 33.0 | 14157.0 | 5.0 |
| 3 | 00024acbcdf0a6daa1e931b038114c75 | 6 days 03:32:04 | 6.147269 | 0 days 00:09:43 | 2 days 03:17:42 | 4 days 00:04:39 | -6 days +13:32:39 | 0.006748 | 2.137292 | 4.003229 | ... | credit_card | 2 | 25.78 | perfumaria | 200.0 | 16.0 | 10.0 | 15.0 | 2400.0 | 4.0 |
| 4 | 00042b26cf59d7ce69dfabb4e55b4fd9 | 25 days 02:44:40 | 25.114352 | 0 days 00:12:22 | 11 days 19:35:56 | 13 days 06:56:22 | -16 days +16:42:31 | 0.008588 | 11.816620 | 13.289144 | ... | credit_card | 3 | 218.04 | ferramentas_jardim | 3750.0 | 35.0 | 40.0 | 30.0 | 42000.0 | 5.0 |
5 rows × 40 columns
Calculating derived attributes¶
Calculating other derived attributes¶
Preprocessing time
Carrier takeover time
Customer delivery time
Total delivery time
Time to estimate days
Late deliveries (%)
Distance (km) (using Haversine formula)
Distance efficiency (distance_km / delivery_time_days) ---> higher values indicate faster deliveries across distances
Freight cost per km (freight_value / distance_km) ---> identifies how freight scales with distance
Price to distance ratio (price / distance_km) ---> evaluate if higher-priced products tend to be delivered over longer or shorter distances
Cost efficiency (freight_value / delivery_time_days) ---> how efficiently freight costs are being used relative to the delivery speed
Order size per km (price / distance_km) ---> whether higher priced products tend to have longer delivery
Total order cost per km (total_payment_value / distance_km)
Freight share (freight_value / total_payment_value)
Preprocessing time % (order_processing_time_days / delivery_time_days)
Carrier takeover time % (carrier_delivery_time_days / delivery_time_days)
Customer delivery time % (customer_delivery_time_days / delivery_time_days)
# @title Calculations
import numpy as np

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in km between two points given in decimal degrees.

    Accepts scalars or array-likes (vectorized via numpy).
    """
    earth_radius_km = 6371
    lat1_rad, lat2_rad = np.radians(lat1), np.radians(lat2)
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    # Haversine of the central angle between the two points.
    half_chord = (
        np.sin(dlat / 2.0) ** 2
        + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2.0) ** 2
    )
    central_angle = 2.0 * np.arctan2(np.sqrt(half_chord), np.sqrt(1.0 - half_chord))
    return earth_radius_km * central_angle
# Customer -> seller great-circle distance for every order.
orders_merged['distance_km'] = haversine(
    orders_merged['c_geolocation_lat'],
    orders_merged['c_geolocation_lng'],
    orders_merged['s_geolocation_lat'],
    orders_merged['s_geolocation_lng'],
)

# Derived logistics metrics.
# 1) km covered per delivery day — higher means faster across distances.
orders_merged['distance_efficiency'] = orders_merged['distance_km'] / orders_merged['delivery_time_days']
# 2) freight cost per km — how freight cost scales with distance.
orders_merged['freight_cost_per_km'] = orders_merged['freight_value'] / orders_merged['distance_km']
# 3) item price per km — do pricier products travel farther or shorter?
orders_merged['price_to_distance'] = orders_merged['price'] / orders_merged['distance_km']
# 4) freight spend per delivery day — cost relative to delivery speed.
orders_merged['cost_efficiency'] = orders_merged['freight_value'] / orders_merged['delivery_time_days']
# 5) NOTE(review): identical formula to price_to_distance (price / distance_km);
#    kept as a separate column because downstream cells reference it by name.
orders_merged['order_size_per_km'] = orders_merged['price'] / orders_merged['distance_km']
# 6) total amount paid per km travelled.
orders_merged['total_order_cost_per_km'] = orders_merged['total_payment_value'] / orders_merged['distance_km']
# 7) share of freight cost in the total order value.
orders_merged['freight_share'] = orders_merged['freight_value'] / orders_merged['total_payment_value']
# 8) 1 when the order arrived after the estimated date, else 0
#    (NaN compares False, so undelivered orders are flagged 0, as before).
orders_merged['late_delivery'] = (orders_merged['time_to_estimate_delivery_days'] > 0).astype(int)
Removal of outliers¶
# @title Box Plots of Delivery Times
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

# Set a global serif font and consistent title/label sizes for all plots below.
mpl.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 16,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# Columns screened for outliers: the delivery-duration metrics plus distance.
columns_to_check = [
    'delivery_time_days', 'order_processing_time_days', 'carrier_delivery_time_days',
    'customer_delivery_time_days', 'time_to_estimate_delivery_days', 'distance_km'
]

# Set up the figure for multiple subplots (3 rows, ceil(n/3) columns).
plt.figure(figsize=(15, 10))
# One Viridis shade per column.
viridis_colors = plt.cm.viridis(np.linspace(0, 1, len(columns_to_check)))

# Create individual box plots for each column.
for i, col in enumerate(columns_to_check, 1):
    plt.subplot(3, (len(columns_to_check) + 2) // 3, i)
    sns.boxplot(data=orders_merged[col], orient='v', color=viridis_colors[i-1])  # Use Viridis color
    # Set title for each subplot
    plt.title(col, fontsize=12)
    # Hide top/right/left spines for a cleaner look.
    ax = plt.gca()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)

# Adjust layout and display the plot
plt.tight_layout()
plt.show()
# @title Removing outliers
# Function to remove outliers using IQR method
def remove_outliers_iqr(df, columns):
    """Drop rows whose value in any of *columns* lies outside the IQR fences.

    For each column the fences are [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; filtering is
    applied sequentially, so each column is judged on the rows that survived
    the previous ones. The input frame is not modified.
    """
    filtered = df.copy()
    for column in columns:
        q1, q3 = filtered[column].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        # between() is inclusive on both ends, matching >= / <= bounds;
        # rows with NaN in this column are dropped, as before.
        filtered = filtered[filtered[column].between(lower_fence, upper_fence)]
    return filtered
# Strip outlier rows from the engineered per-order table.
orders_cleaned = remove_outliers_iqr(orders_merged, columns_to_check)
# Report the remaining size.
print(f"Dataset shape after: {orders_cleaned.shape}")
Dataset shape after: (68055, 49)
Inspecting derived attributes¶
- some of the metrics can contain +inf or -inf values (e.g., price to distance when the distance is close to 0 km, perhaps a few hundred meters)
---> therefore, we need to trim these values
- we can use the mean +/- 3 std as a replacement value
import numpy as np
def replace_inf_with_thresholds(df, metrics):
    """Clamp infinities in *metrics* columns to mean +/- 3*std.

    For each metric, the mean and std are computed over its finite values
    only; +inf becomes mean + 3*std and -inf becomes mean - 3*std.
    Returns a new DataFrame; the input is untouched.
    """
    result = df.copy()
    for metric in metrics:
        # Statistics over finite values only (infs masked out as NaN).
        finite_values = result[metric].replace([np.inf, -np.inf], np.nan)
        center = finite_values.mean()
        spread = finite_values.std()
        # Substitute the +/- 3-sigma thresholds for the infinities.
        column = result[metric]
        column = column.mask(column == np.inf, center + 3 * spread)
        column = column.mask(column == -np.inf, center - 3 * spread)
        result[metric] = column
    return result
# Ratio metrics where a near-zero distance (or payment value) can yield
# +/-inf; clamp those to mean +/- 3*std.
metrics_to_clean = [
    'freight_cost_per_km', 'price_to_distance', 'cost_efficiency',
    'order_size_per_km', 'total_order_cost_per_km', 'freight_share', 'review_score',
]
orders_cleaned = replace_inf_with_thresholds(orders_cleaned, metrics_to_clean)
Exploratory Data Analysis¶
Correlation matrix¶
# @title Correlation matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Columns included in the correlation matrix.
# NOTE(review): 'processing_share', 'carrier_share' and 'customer_share' are
# not created anywhere in this chunk — presumably derived in an earlier cell;
# verify they exist before running.
cols = [
    'processing_share', 'carrier_share', 'customer_share', 'price',
    'freight_value', 'payment_installments', 'total_payment_value',
    'distance_km', 'distance_efficiency', 'freight_cost_per_km',
    'price_to_distance', 'cost_efficiency', 'order_size_per_km',
    'total_order_cost_per_km', 'freight_share', 'product_weight_g',
    'product_length_cm', 'product_height_cm', 'product_width_cm',
    'product_vol', 'review_score', 'delivery_time_days', 'order_processing_time_days',
    'carrier_delivery_time_days', 'customer_delivery_time_days', 'time_to_estimate_delivery_days'
]

# Calculate the (pairwise Pearson) correlation matrix.
corr_matrix = orders_cleaned[cols].corr()

# Mask the redundant upper triangle of the symmetric matrix.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Large canvas so the 26x26 annotated cells stay readable.
plt.figure(figsize=(30, 18))

# Heatmap with Viridis palette, two-decimal annotations, larger annotation font.
heatmap = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",  # Limit annotations to two decimal places
    annot_kws={"size": 10},
    cmap="viridis",  # Set the colormap to Viridis
    mask=mask,
    square=True,
    cbar_kws={"shrink": .75}
)

# Set the title of the heatmap
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 16}, pad=16)

# Show the plot
plt.show()

# Observations:
# - product price is positively correlated with distance in km (0.32) and
#   distance efficiency (0.27) ---> higher priced products tend to be delivered
#   faster across distances
# - price is negatively correlated with freight cost per km (-0.33) ---> freight
#   cost scales well with distance for higher priced products (well optimised),
#   so freight contributes less to the total cost
# - review score is slightly positively correlated with distance efficiency
#   (0.09) and cost efficiency (0.08), and negatively correlated with customer
#   delivery time and time to estimated delivery
Late vs. On-time Deliveries¶
# @title Key Performance Metrics for Late vs. On-Time Deliveries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Two Viridis shades: one for on-time, one for late deliveries.
colors = sns.color_palette("viridis", 2)

# Work on a copy so the cleaned dataset is not mutated by this cell.
df_copy = orders_cleaned.copy()

# Binary flag: 1 when the order arrived after the estimated delivery date.
df_copy['late_delivery'] = df_copy['time_to_estimate_delivery_days'].apply(lambda x: 1 if x > 0 else 0)

# Step 1: top 10 product categories by number of orders.
top_10_categories = df_copy['product_category_name'].value_counts().nlargest(10).index

# Metrics compared between late and on-time deliveries.
metrics = [
    'total_payment_value', 'price', 'freight_value', 'distance_km',
    'review_score', 'distance_efficiency', 'freight_cost_per_km',
    'price_to_distance', 'cost_efficiency', 'order_size_per_km',
    'total_order_cost_per_km', 'freight_share'
]

# Portuguese -> English labels for the plotted categories.
category_mapping = {
    'cama_mesa_banho': 'Bedding, Table, and Bath',
    'beleza_saude': 'Beauty and Health',
    'esporte_lazer': 'Sports and Leisure',
    'utilidades_domesticas': 'Household Utilities',
    'moveis_decoracao': 'Furniture and Decoration',
    'informatica_acessorios': 'Computers and Accessories',
    'relogios_presentes': 'Watches and Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}

# Step 2: clamp infinite values to mean +/- 3*std before averaging.
# BUGFIX: the previous version computed mean/std BEFORE masking infinities
# (so both statistics could themselves be inf/NaN), and then replaced the
# infs with NaN first — which made the subsequent +/-3*std substitutions
# unreachable no-ops. Reuse the helper defined earlier in this notebook,
# which computes the statistics over finite values and applies the intended
# substitution.
df_copy = replace_inf_with_thresholds(df_copy, metrics)

# Step 3: one bar subplot per metric.
fig, axs = plt.subplots(nrows=len(metrics), ncols=1, figsize=(14, 6 * len(metrics)))

for i, metric in enumerate(metrics):
    # Mean of the metric per (category, late flag).
    mean_values = df_copy.groupby(['product_category_name', 'late_delivery'])[metric].mean().reset_index()
    # Keep only the top 10 categories.
    mean_values = mean_values[mean_values['product_category_name'].isin(top_10_categories)]
    # Wide format: one column per delivery status.
    mean_values_long = mean_values.pivot(index='product_category_name',
                                         columns='late_delivery',
                                         values=metric).reset_index()
    # NOTE(review): assumes every top-10 category has both on-time (0) and
    # late (1) orders; if a group were missing this rename would misalign.
    mean_values_long.columns = ['product_category_name', 'On Time', 'Late']
    # Translate category names for display.
    mean_values_long['product_category_name'] = mean_values_long['product_category_name'].map(category_mapping)
    # Grouped bar plot per category.
    mean_values_long.set_index('product_category_name').plot(kind='bar', ax=axs[i], color=colors)
    # Titles, axis labels and tick labels.
    axs[i].set_title(f'Mean {metric.replace("_", " ").title()} for Top 10 Product Categories')
    axs[i].set_xlabel('Product Categories')
    axs[i].set_ylabel(f'Mean {metric.replace("_", " ").title()}')
    axs[i].set_xticks(range(len(mean_values_long['product_category_name'])))
    axs[i].set_xticklabels(mean_values_long['product_category_name'], rotation=45)
    axs[i].legend(title='Delivery Status', labels=['On Time', 'Late'])
    # Cleaner look: remove the top, left, and right spines.
    axs[i].spines['top'].set_visible(False)
    axs[i].spines['left'].set_visible(False)
    axs[i].spines['right'].set_visible(False)

# Adjust layout to prevent overlap and render.
plt.tight_layout()
plt.show()
Largest Cities Analysis¶
# here we could produce more aggregated stats and explore differences across cities, product categories, sellers
# 1) groupby city (see average delivery time, late deliveries %, total payment value, freight share)
# @title Assigning city labels based on geographic coordinates
# add a city label based on predefined centered city coordinates
import pandas as pd
from geopy.distance import geodesic
# Reference (lat, lng) centers for the largest metro areas of interest.
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    #'Fortaleza': (-3.7172, -38.5434),
    'Salvador': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

def label_city(row):
    """Return the first city whose center is within 50 km of the customer
    location, or None when no city matches."""
    customer_point = (row['c_geolocation_lat'], row['c_geolocation_lng'])
    for city_name, center in cities_coords.items():
        if geodesic(customer_point, center).km <= 50:  # 50 km radius
            return city_name
    return None

# Tag each order with the customer's metro area (None outside all radii).
orders_cleaned['city'] = orders_cleaned.apply(label_city, axis=1)
# @title Delivery Times by Largest Cities
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# 'order_id' is counted; the remaining metrics are averaged per city.
metrics = ['order_id', 'delivery_time_days', 'late_delivery', 'total_payment_value', 'freight_share']

# Group by city and compute the per-city aggregates.
grouped = orders_cleaned.groupby('city')[metrics].agg(
    {'order_id': 'count', 'delivery_time_days': 'mean', 'late_delivery': 'mean', 'total_payment_value': 'mean', 'freight_share': 'mean'}
).reset_index().rename(columns={'order_id': 'count'})

# Set up plot style and font properties.
plt.rcParams.update({
    "font.family": "serif",
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# One Viridis color per city.
viridis_colors = list(plt.cm.viridis(np.linspace(0, 1, len(grouped['city'].unique()))))

# Four bar subplots (2x2): mean delivery time, late-delivery share,
# payment value, and freight share per city.
plt.figure(figsize=(14, 10))
for i, metric in enumerate(metrics[1:]):  # skip 'order_id' (renamed to 'count')
    ax = plt.subplot(2, 2, i+1)
    sns.barplot(data=grouped, x='city', y=metric, hue="city", palette=viridis_colors[:len(grouped['city'].unique())], legend=False)
    ax.set_title(metric.replace('_', ' ').capitalize())
    ax.set_xlabel('City')
    ax.set_ylabel(metric.replace('_', ' ').capitalize())
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()
# 2) in rows we have cities and in columns we have product categories
# the matrix can display:
# the count of orders ---> this can tell whether certain product categories are ordered more or less often from remote areas
# average delivery time (in days) ---> whether delivery time differs for certain cities/ products
# delivery delay % ---> the same
# @title Key Sales Metric Comparison Across Cities - Heatmaps
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'orders_cleaned' is already labeled with cities (previous cell).
# Step 1: per-(city, product category) aggregates used by the heatmaps below.
grouped = orders_cleaned.groupby(['city', 'product_category_name']).agg(
    order_count=('order_id', 'count'),
    avg_delivery_time=('delivery_time_days', 'mean'),
    late_delivery_count=('late_delivery', lambda x: (x == 1).sum()),
    freight_cost_per_km=('freight_cost_per_km', 'mean'),
    price_to_distance=('price_to_distance', 'mean'),
    cost_efficiency=('cost_efficiency', 'mean'),
    order_size_per_km=('order_size_per_km', 'mean'),
    total_order_cost_per_km=('total_order_cost_per_km', 'mean'),
    freight_share=('freight_share', 'mean'),
    review_score=('review_score', 'mean')
).reset_index()

# Step 2: share of late deliveries per (city, category), in percent.
grouped['delivery_delay_percentage'] = (grouped['late_delivery_count'] / grouped['order_count']) * 100

# Step 3: top 10 product categories by total order count.
top_10_product_categories = grouped.groupby('product_category_name')['order_count'].sum().nlargest(10).index.tolist()

# Filter grouped data for top 10 product categories.
grouped_top_10 = grouped[grouped['product_category_name'].isin(top_10_product_categories)]

# Step 5: Portuguese -> English display names for the categories.
category_mapping = {
    'automotivo': 'Automotive',
    'beleza_saude': 'Beauty & Health',
    'brinquedos': 'Toys',
    'cama_mesa_banho': 'Bedding, Table & Bath',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'utilidades_domesticas': 'Household Items'
}

# Step 6: metric -> (display title, decimal places in annotations).
# NOTE: the first two entries are normalized matrix-wide below, the rest
# column-wise — so the insertion order of this dict matters.
metrics = {
    'order_count': ('Order Count', 0),
    'avg_delivery_time': ('Average Delivery Time (Days)', 2),
    'delivery_delay_percentage': ('Delivery Delay Percentage (%)', 2),
    'freight_cost_per_km': ('Freight Cost per Km', 2),
    'price_to_distance': ('Price to Distance', 2),
    'cost_efficiency': ('Cost Efficiency', 2),
    'order_size_per_km': ('Order Size per Km', 2),
    'total_order_cost_per_km': ('Total Order Cost per Km', 2),
    'freight_share': ('Freight Share', 2),
    'review_score': ('Review Score', 2)
}

# Notes shown beside the colorbar, depending on how the colors were scaled.
matrix_note = "*Color map applied for whole matrix, comparing all cities and product categories"
column_note = "*Color map applied column-wise, comparing cities per product category"

# Step 7: one heatmap per metric; colors come from normalized values while
# annotations show the raw values.
for i, (metric, (title, decimal_places)) in enumerate(metrics.items()):
    # Cities x categories pivot; missing combinations become 0.
    pivot_table = grouped_top_10.pivot(index='city', columns='product_category_name', values=metric).fillna(0)
    pivot_table.rename(columns=category_mapping, inplace=True)
    pivot_table.columns.name = None  # Flatten the MultiIndex

    # First two metrics: min-max normalize over the whole matrix; the rest
    # column-wise (per product category).
    # NOTE(review): if a matrix/column were constant, max == min would divide
    # by zero and yield NaN cells — assumed not to occur with this data.
    if i < 2:
        normed_pivot = (pivot_table - pivot_table.values.min()) / (pivot_table.values.max() - pivot_table.values.min())
        color_map_note = matrix_note
    else:
        normed_pivot = pivot_table.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
        color_map_note = column_note

    # Heatmap colored by normalized values, annotated with original values.
    plt.figure(figsize=(12, 8))
    ax = sns.heatmap(
        normed_pivot,
        cmap='viridis',
        annot=pivot_table.round(decimal_places),  # Use original values for annotations
        fmt=f".{decimal_places}f",
        cbar_kws={'label': title}
    )
    plt.title(f"{title} by City and Product Category")
    plt.xlabel("Product Category")
    plt.ylabel("City")
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Adjust the colorbar position to make room for the note.
    plt.subplots_adjust(right=0.85)
    colorbar = ax.collections[0].colorbar
    # Place the scaling note to the right of the color bar, vertically centered.
    colorbar.ax.text(
        3.0,  # x (colorbar axes coords): shift right of the bar
        0.5,  # y: center vertically
        color_map_note,
        fontsize=10,
        color='black',
        ha='left',
        va='center',
        transform=colorbar.ax.transAxes
    )
    plt.show()
# 3) in rows 10-20 largest sellers and in columns: num_orders, average order size (payment_value), average delivery time, average freight share
# @title Top Sellers with Key Metrics - Heatmap
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'orders_cleaned' includes all necessary seller-level data.
# Step 1: per-seller aggregates of volume, value, speed and efficiency.
grouped_sellers = orders_cleaned.groupby('seller_id').agg(
    order_count=('order_id', 'count'),
    avg_order_size=('total_payment_value', 'mean'),
    avg_delivery_time=('delivery_time_days', 'mean'),
    avg_freight_share=('freight_share', 'mean'),
    freight_cost_per_km=('freight_cost_per_km', 'mean'),
    price_to_distance=('price_to_distance', 'mean'),
    cost_efficiency=('cost_efficiency', 'mean'),
    order_size_per_km=('order_size_per_km', 'mean'),
    total_order_cost_per_km=('total_order_cost_per_km', 'mean'),
    late_delivery_count=('late_delivery', lambda x: (x == 1).sum()),
    review_score=('review_score', 'mean')
).reset_index()

# Step 2: share of late deliveries per seller, in percent.
grouped_sellers['delivery_delay_percentage'] = (grouped_sellers['late_delivery_count'] / grouped_sellers['order_count']) * 100

# Step 3: keep the 30 sellers with the most orders.
top_sellers = grouped_sellers.sort_values(by='order_count', ascending=False).head(30)

# Step 4: replace the opaque seller_id with a 1-based volume rank.
top_sellers['Rank'] = range(1, len(top_sellers) + 1)
top_sellers.set_index('Rank', inplace=True)

# Step 5: columns shown in the heatmap.
metrics = [
    'order_count', 'avg_order_size', 'avg_delivery_time', 'avg_freight_share',
    'freight_cost_per_km', 'price_to_distance', 'cost_efficiency', 'order_size_per_km',
    'total_order_cost_per_km', 'delivery_delay_percentage', 'review_score'
]
pivot_data = top_sellers[metrics]

# Metrics where lower values are better; their color scale is inverted in the
# normalization step below.
negative_metrics = ['avg_delivery_time', 'avg_freight_share', 'freight_cost_per_km',
                    'price_to_distance', 'total_order_cost_per_km', 'delivery_delay_percentage']
# Step 6: Normalize each column for per-column color scaling, inverting where higher values are not beneficial
normed_data = pivot_data.apply(lambda x: (x.max() - x) / (x.max() - x.min()) if x.name in negative_metrics
else (x - x.min()) / (x.max() - x.min()), axis=0)
# Step 7: Format data for annotation
formatted_data = pivot_data.copy()
formatted_data['order_count'] = pivot_data['order_count'].astype(int) # Format as integer for order_count
formatted_data = formatted_data.apply(lambda x: x.map('{:.2f}'.format) if x.name != 'order_count' else x.map('{:.0f}'.format))
# Step 8: Plot the heatmap with per-column scaling
plt.figure(figsize=(12, 8))
sns.heatmap(
normed_data,
cmap='viridis',
annot=formatted_data, # Display formatted values
fmt="", # Disable scientific notation
cbar_kws={'label': 'Normalized Scale (Per Column)'}
)
# Add title and labels
plt.title("Top 30 Sellers with Key Metrics - Heatmap")
plt.xlabel("Metric")
plt.ylabel("Largest Sellers (Rank)")
# Customize tick label rotation
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
# Adjust layout
plt.tight_layout()
plt.show()
# @title Delivery Performance Metrics by Product Category by Late and On-Time Deliveries - Heatmaps
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Step 1: Aggregate by product category and late-delivery flag.
# `order_count` must COUNT order rows: the original used ('order_id', 'sum'),
# which concatenates the hash-string IDs and breaks the later
# `nlargest` ranking on object dtype.
aggregated_data = orders_cleaned.groupby(['product_category_name', 'late_delivery']).agg(
    order_count=('order_id', 'count'),  # number of orders in the group
    avg_order_size=('total_payment_value', 'mean'),
    avg_delivery_time=('delivery_time_days', 'mean'),
    avg_freight_share=('freight_share', 'mean'),
    freight_cost_per_km=('freight_cost_per_km', 'mean'),
    price_to_distance=('price_to_distance', 'mean'),
    cost_efficiency=('cost_efficiency', 'mean'),
    order_size_per_km=('order_size_per_km', 'mean'),
    total_order_cost_per_km=('total_order_cost_per_km', 'mean'),
    review_score=('review_score', 'mean')
).reset_index()
# Step 2: Find the top 10 categories by total order count and keep only their rows
top_10_categories = aggregated_data.groupby('product_category_name')['order_count'].sum().nlargest(10).index.tolist()
filtered_data = aggregated_data[aggregated_data['product_category_name'].isin(top_10_categories)]
# Step 3: Pivot the DataFrame with delivery status as the second level of columns
pivoted_data = filtered_data.pivot_table(
    index='product_category_name',
    columns='late_delivery',
    values=[
        'avg_order_size',
        'avg_delivery_time',
        'avg_freight_share',
        'freight_cost_per_km',
        'price_to_distance',
        'cost_efficiency',
        'order_size_per_km',
        'total_order_cost_per_km',
        'review_score'
    ],
    aggfunc='mean'
)
# Step 4: Relabel the late_delivery flag (0/1) as 'On-Time'/'Late' in the column index
pivoted_data.columns = pd.MultiIndex.from_tuples(
    [(metric, 'On-Time' if late_deliv == 0 else 'Late') for metric, late_deliv in pivoted_data.columns],
    names=["Metric", "Delivery Status"]
)
# Step 5: Reset index to bring 'product_category_name' into the DataFrame
pivoted_data.reset_index(inplace=True)
# Step 6: Translate product category names to English
translation_mapping = {
    'cama_mesa_banho': 'Bedding and Bath',
    'beleza_saude': 'Beauty and Health',
    'esporte_lazer': 'Sports and Leisure',
    'utilidades_domesticas': 'Household Utilities',
    'informatica_acessorios': 'Computers and Accessories',
    'moveis_decoracao': 'Furniture and Decoration',
    'relogios_presentes': 'Watches and Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# Categories missing from the mapping keep their original (Portuguese) name
pivoted_data['product_category_name'] = pivoted_data['product_category_name'].map(translation_mapping).fillna(pivoted_data['product_category_name'])
# Step 7: Order product categories by total order count.
# `top_10_categories` holds the ORIGINAL (Portuguese) names, but the column was
# translated to English in Step 6 -- so the ordering list must be translated
# too, otherwise pd.Categorical finds no matches and maps every row to NaN.
total_order_counts = aggregated_data.groupby('product_category_name')['order_count'].sum()
sorted_categories = [
    translation_mapping.get(cat, cat)  # mirror Step 6: untranslated names pass through
    for cat in total_order_counts[top_10_categories].sort_values(ascending=False).index
]
pivoted_data['product_category_name'] = pd.Categorical(
    pivoted_data['product_category_name'], categories=sorted_categories, ordered=True
)
pivoted_data = pivoted_data.sort_values('product_category_name')
# Step 8: Reshape: move the Metric level into the rows, then spread Delivery
# Status back across the columns so each status can be sliced out below.
final_table = pivoted_data.set_index('product_category_name').stack(level=0, future_stack=True).unstack(level=1)
# Step 9: Separate data for On-Time and Late deliveries
on_time_data = final_table.xs('On-Time', level='Delivery Status', axis=1)
late_data = final_table.xs('Late', level='Delivery Status', axis=1)
# Step 10: Normalize each column independently for column-wise color scaling
# NOTE(review): assumes max != min per column, otherwise this divides by zero
on_time_normalized = on_time_data.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
late_normalized = late_data.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
# Step 11: Heatmaps for On-Time and Late deliveries; colors use the normalized
# values while annotations show the raw metric values.
plt.figure(figsize=(15, 10))
sns.heatmap(on_time_normalized, cmap='viridis', annot=on_time_data, fmt=".2f", cbar=True)
plt.title("Delivery Performance Metrics by Product Category - On-Time Deliveries")
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(15, 10))
sns.heatmap(late_normalized, cmap='viridis', annot=late_data, fmt=".2f", cbar=True)
plt.title("Delivery Performance Metrics by Product Category - Late Deliveries")
plt.xticks(rotation=45)
plt.show()
Exploration of the largest cities¶
Rio de Janeiro, Sao Paulo, Porto Alegre, Curitiba, Fortaleza, Bahia, Brasilia
Customer Geolocation¶
Total Delivery Time¶
# Exploring delivery times based on customer location is essential for several reasons:
#
# 1. Logistics Optimization: Delivery times vary due to geographic factors (distance, traffic, infrastructure).
# By analyzing regional patterns, companies can identify areas where delays occur and optimize routing,
# adjust warehouse placement, or streamline processes for better efficiency.
#
# 2. Customer Experience: Understanding location-based delivery performance enables companies to set more
# accurate delivery expectations, improving customer satisfaction and reducing complaints about late deliveries.
#
# 3. Cost Management: By pinpointing regions with frequent delays, companies can assess whether the delivery
# costs are proportionate to customer locations and adjust pricing, warehousing, or transportation strategies.
All customers¶
# @title Delivery Performance in Major Brazilian Cities (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Distance filter using geopy's `geodesic` (an ellipsoidal distance, not the
# spherical Haversine formula the original comment claimed): keep orders whose
# customer location lies within `radius_km` of the city centre.
def filter_city_data(city_coords, radius_km=50):
    """Return a copy of `orders_cleaned` rows within radius_km of city_coords."""
    lat, lng = city_coords
    within_radius = orders_cleaned.apply(
        lambda row: geodesic((row['c_geolocation_lat'], row['c_geolocation_lng']), (lat, lng)).km <= radius_km,
        axis=1
    )
    return orders_cleaned[within_radius].copy()  # .copy() avoids SettingWithCopyWarning

# Filter each city exactly once and reuse the result below: the per-row geodesic
# apply is expensive, and the original version ran it twice per city.
city_data_map = {city: filter_city_data(coords) for city, coords in cities_coords.items()}

# Global min and max delivery times across all cities for a consistent color scale
all_delivery_times = []
for city_data in city_data_map.values():
    all_delivery_times.extend(city_data['delivery_time_days'].dropna())
global_min = min(all_delivery_times)
global_max = max(all_delivery_times)

# Create a 3x3 grid of subplots (sufficient for the 7 cities)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = city_data_map[city]
    if len(city_data) > 0:  # Ensure there's data to plot
        # Scatter: color = delivery time (inverted Viridis, lower is better),
        # dot size = total payment value of the order.
        scatter = sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['delivery_time_days'],
            palette='viridis_r',
            size=city_data['total_payment_value'],
            sizes=(20, 200),
            ax=ax,
            legend=False  # Disable legend for individual plots
        )
    ax.set_title(f"Delivery Performance in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared horizontal colorbar spanning the global delivery-time range
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Delivery Time (Days)')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Delivery Performance in Major Brazilian Cities, Dot Size by Customer Payment Value', fontsize=16, y=0.98)
plt.show()
Distance Efficiency (distance in km per delivery time in days)¶
---> higher values indicate faster deliveries across distances
# Analyzing distance efficiency based on customer location is important because:
#
# 1. Operational Optimization: Distance efficiency reveals how effectively routes are being utilized
# relative to delivery times. It helps identify inefficiencies in delivery networks and provides
# insights into how to optimize routes and resources for improved service.
#
# 2. Geographic Challenges: Distance efficiency can highlight areas with challenging infrastructure
# or topography that may cause inefficiencies. Understanding these factors helps adjust strategies
# for specific regions, such as choosing better-suited transportation methods or adding local warehouses.
#
# 3. Cost Efficiency: Tracking distance efficiency helps identify where excessive fuel or resources
# may be used due to poor routing or geographic challenges, leading to potential cost reductions in
# logistics and fuel consumption.
# @title Delivery Performance in Major Brazilian Cities (Dot Size by Customer Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Distance filter using geopy's `geodesic` (an ellipsoidal distance, not the
# spherical Haversine formula the original comment claimed): keep orders whose
# customer location lies within `radius_km` of the city centre.
def filter_city_data(city_coords, radius_km=50):
    """Return a copy of `orders_cleaned` rows within radius_km of city_coords."""
    lat, lng = city_coords
    within_radius = orders_cleaned.apply(
        lambda row: geodesic((row['c_geolocation_lat'], row['c_geolocation_lng']), (lat, lng)).km <= radius_km,
        axis=1
    )
    return orders_cleaned[within_radius].copy()  # .copy() avoids SettingWithCopyWarning

# Filter each city exactly once and reuse the result below: the per-row geodesic
# apply is expensive, and the original version ran it twice per city.
city_data_map = {city: filter_city_data(coords) for city, coords in cities_coords.items()}

# Global min and max distance efficiency across all cities for a consistent color scale
all_distance_efficiency = []
for city_data in city_data_map.values():
    all_distance_efficiency.extend(city_data['distance_efficiency'].dropna())
global_min = min(all_distance_efficiency)
global_max = max(all_distance_efficiency)

# Create a 3x3 grid of subplots (sufficient for the 7 cities)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = city_data_map[city]
    if len(city_data) > 0:  # Ensure there's data to plot
        # Scatter: color = distance efficiency (plain Viridis, higher is better),
        # dot size = total payment value of the order.
        scatter = sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['distance_efficiency'],
            palette='viridis',
            size=city_data['total_payment_value'],
            sizes=(20, 200),
            ax=ax,
            legend=False  # Disable legend for individual plots
        )
    ax.set_title(f"Delivery Performance in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared horizontal colorbar spanning the global distance-efficiency range
sm = plt.cm.ScalarMappable(cmap='viridis', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Distance Efficiency')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Delivery Performance in Major Brazilian Cities\nDot Size by Customer Payment Value', fontsize=16, y=0.98)
plt.show()
Customer reviews¶
# Analyzing customer review scores based on location provides critical insights into the overall service quality:
#
# 1. Service Quality by Region: Customer satisfaction can vary across different regions due to factors such
# as delivery speed, product availability, or local preferences. Identifying areas with lower review scores
# helps target improvements in those regions.
#
# 2. Logistics Impact: Geographical issues, such as infrastructure or delivery delays, can negatively
# impact customer experiences and reviews. Monitoring scores regionally helps identify if logistics
# challenges are influencing customer satisfaction.
#
# 3. Targeted Improvements: By understanding the relationship between location and review scores,
# companies can implement targeted actions, such as improving last-mile delivery services, offering
# local support, or addressing specific regional complaints to boost customer experience.
# @title Customer Reviews in Major Brazilian Cities (Dot Size by Total Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Distance filter using geopy's `geodesic` (an ellipsoidal distance, not the
# spherical Haversine formula the original comment claimed): keep orders whose
# customer location lies within `radius_km` of the city centre.
def filter_city_data(city_coords, radius_km=50):
    """Return a copy of `orders_cleaned` rows within radius_km of city_coords."""
    lat, lng = city_coords
    within_radius = orders_cleaned.apply(
        lambda row: geodesic((row['c_geolocation_lat'], row['c_geolocation_lng']), (lat, lng)).km <= radius_km,
        axis=1
    )
    return orders_cleaned[within_radius].copy()  # .copy() avoids SettingWithCopyWarning

# Filter each city exactly once and reuse the result below: the per-row geodesic
# apply is expensive, and the original version ran it twice per city.
city_data_map = {city: filter_city_data(coords) for city, coords in cities_coords.items()}

# Global min and max review scores across all cities for a consistent color scale
all_review_scores = []
for city_data in city_data_map.values():
    all_review_scores.extend(city_data['review_score'].dropna())
global_min = min(all_review_scores)
global_max = max(all_review_scores)

# Create a 3x3 grid of subplots (sufficient for the 7 cities)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = city_data_map[city]
    if len(city_data) > 0:  # Ensure there's data to plot
        # Scatter: color = review score (plain Viridis, higher is better),
        # dot size = total payment value of the order.
        scatter = sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['review_score'],
            palette='viridis',
            size=city_data['total_payment_value'],
            sizes=(20, 200),
            ax=ax,
            legend=False  # Disable legend for individual plots
        )
    ax.set_title(f"Customer Review Scores in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared horizontal colorbar spanning the global review-score range
sm = plt.cm.ScalarMappable(cmap='viridis', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Review Score')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Customer Reviews in Major Brazilian Cities, Dot Size by Total Payment Value', fontsize=16, y=0.98)
plt.show()
Time To Estimate Delivery Days¶
# Analyzing "Time to Estimate Delivery Days" based on customer location is important because:
#
# 1. Accuracy in Delivery Predictions: Understanding how well delivery estimates align with actual delivery
# times across different regions allows businesses to improve their prediction models. This ensures customers
# have more accurate expectations for their orders.
#
# 2. Regional Variations: Certain cities or regions may consistently show longer or shorter time estimates due to
# local conditions, infrastructure, or logistical challenges. Identifying these patterns helps adjust
# estimations to be region-specific.
#
# 3. Customer Satisfaction: Providing more accurate delivery time estimates leads to higher customer satisfaction.
# Misalignments between estimated and actual delivery times can result in dissatisfaction, complaints, or negative reviews.
# @title Time To Estimate Delivery Days in Major Brazilian Cities (Dot Size by Total Payment Value)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    'Fortaleza': (-3.7172, -38.5434),
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Distance filter using geopy's `geodesic` (an ellipsoidal distance, not the
# spherical Haversine formula the original comment claimed): keep orders whose
# customer location lies within `radius_km` of the city centre.
def filter_city_data(city_coords, radius_km=50):
    """Return a copy of `orders_cleaned` rows within radius_km of city_coords."""
    lat, lng = city_coords
    within_radius = orders_cleaned.apply(
        lambda row: geodesic((row['c_geolocation_lat'], row['c_geolocation_lng']), (lat, lng)).km <= radius_km,
        axis=1
    )
    return orders_cleaned[within_radius].copy()  # .copy() avoids SettingWithCopyWarning

# Filter each city exactly once and reuse the result below: the per-row geodesic
# apply is expensive, and the original version ran it twice per city.
city_data_map = {city: filter_city_data(coords) for city, coords in cities_coords.items()}

# Global min and max time-to-estimate across all cities for a consistent color scale
all_time_to_estimate = []
for city_data in city_data_map.values():
    all_time_to_estimate.extend(city_data['time_to_estimate_delivery_days'].dropna())
global_min = min(all_time_to_estimate)
global_max = max(all_time_to_estimate)

# Create a 3x3 grid of subplots (sufficient for the 7 cities)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = city_data_map[city]
    if len(city_data) > 0:  # Ensure there's data to plot
        # Scatter: color = time to estimated delivery (inverted Viridis, lower
        # is better), dot size = total payment value of the order.
        scatter = sns.scatterplot(
            x=city_data['c_geolocation_lng'],
            y=city_data['c_geolocation_lat'],
            hue=city_data['time_to_estimate_delivery_days'],
            palette='viridis_r',
            size=city_data['total_payment_value'],
            sizes=(20, 200),
            ax=ax,
            legend=False  # Disable legend for individual plots
        )
    ax.set_title(f"Time To Estimate Delivery Days in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Shared horizontal colorbar spanning the global time-to-estimate range
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=plt.Normalize(global_min, global_max))
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Time To Estimate Delivery Days')

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.subplots_adjust(top=0.92, hspace=0.4)  # Adjust space between subplots
plt.suptitle('Time To Estimate Delivery Days in Major Brazilian Cities, Dot Size by Total Payment Value', fontsize=16, y=0.98)
plt.show()
Sellers Geolocation¶
Sellers Geolocation¶
# Plotting delivery time days based on seller location is essential for understanding the geographical
# impact of delivery performance:
#
# 1. Seller-Based Logistics Insights: By visualizing delivery times relative to the seller's location,
# companies can assess the efficiency of their sellers. This enables identification of specific sellers
# or regions with longer delivery times, allowing for targeted improvements in supply chain or logistics
# management.
#
# 2. Operational Bottlenecks: Regions where sellers consistently have higher delivery times may indicate
# logistical bottlenecks, such as poor infrastructure or longer routes. Addressing these issues can reduce
# delivery delays and improve customer satisfaction.
#
# 3. Seller Performance Monitoring: Sellers with higher volumes of orders may have different delivery
# time patterns compared to those handling fewer orders. Monitoring the relationship between delivery
# times and the number of orders allows businesses to support high-volume sellers with better logistical
# planning and resources.
Delivery time days¶
# @title Delivery Performance in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np

# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    # 'Fortaleza': (-3.7172, -38.5434), # Commented out for layout space
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}

# Distance filter using geopy's `geodesic` (an ellipsoidal distance, not the
# spherical Haversine formula the original comment claimed): keep sellers
# located within `radius_km` of the city centre.
def filter_city_data(city_coords, radius_km=50):
    """Return a copy of `sellers_grouped` rows within radius_km of city_coords."""
    lat, lng = city_coords
    within_radius = sellers_grouped.apply(
        lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
        axis=1
    )
    return sellers_grouped[within_radius].copy()  # .copy() avoids SettingWithCopyWarning

# Create a 3x3 grid of subplots (holds the 6 cities defined above)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing

# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)
    # (A dead `delivery_delay` computation was removed here: it was assigned
    # but never used -- the plot colors by `delivery_time_days` directly.)
    if len(city_data) > 0:  # Guard empty cities, consistent with the customer plots
        # Scatter: color = delivery time (inverted Viridis, lower is better),
        # dot size = the seller's number of orders.
        scatter = sns.scatterplot(
            x=city_data['s_geolocation_lng'],
            y=city_data['s_geolocation_lat'],
            hue=city_data['delivery_time_days'],
            palette='viridis_r',
            size=city_data['num_orders'],
            sizes=(20, 200),
            ax=ax,
            legend=False  # Disable individual legend for the scatter plot
        )
    ax.set_title(f"Delivery Performance in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)

# Colorbar spans the FULL sellers_grouped delivery-time range, not only the plotted cities
norm = plt.Normalize(vmin=sellers_grouped['delivery_time_days'].min(), vmax=sellers_grouped['delivery_time_days'].max())
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)  # Use inverted Viridis colormap
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Delivery Time Days')

# Remove any unused subplots (if there are fewer than 9)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Delivery Performance in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
Distance Efficiency¶
# Plotting Distance Efficiency based on seller locations provides key insights into how well
# sellers are optimizing their deliveries over various distances:
#
# 1. Logistical Optimization: This visualization helps identify which sellers or regions are most
# efficient in managing the distance between their location and customers. Sellers with high
# distance efficiency are likely optimizing their logistics and transportation.
#
# 2. Regional Efficiency Patterns: By comparing cities, we can determine if certain cities consistently
# have more efficient sellers, possibly due to better infrastructure, transport networks, or delivery hubs.
#
# 3. High Volume vs. Efficiency Trade-off: By correlating the number of orders with distance efficiency,
# we can assess if high-volume sellers are sacrificing efficiency for order fulfillment, or if they manage
# to maintain both high order volumes and efficient delivery practices.
# @title Distance Efficiency in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np
# Define cities coordinates (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    # 'Fortaleza': (-3.7172, -38.5434), # Commented out for layout space
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}
# Distance filter using geopy's `geodesic` (an ellipsoidal distance -- not the
# spherical Haversine formula the original label claimed): keep sellers
# located within `radius_km` of the city centre.
def filter_city_data(city_coords, radius_km=50):
    # Returns a copy of `sellers_grouped` rows within radius_km of city_coords.
    lat, lng = city_coords
    city_data = sellers_grouped[
        sellers_grouped.apply(
            lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
            axis=1
        )
    ].copy()  # Use .copy() to avoid SettingWithCopyWarning
    return city_data
# Create a 3x3 grid of subplots (holds the 6 cities defined above)
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing
# Loop through cities and plot
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)
    # Scatter: color = distance efficiency (higher is better), size = number of orders
    scatter = sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['distance_efficiency'],  # Use distance efficiency for color
        palette='viridis',  # Not inverted: higher efficiency is better
        size=city_data['num_orders'],  # Size of points based on number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # Disable individual legend for the scatter plot
    )
    ax.set_title(f"Distance Efficiency in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
# Colorbar spans the FULL sellers_grouped range, not only the plotted cities
norm = plt.Normalize(vmin=sellers_grouped['distance_efficiency'].min(), vmax=sellers_grouped['distance_efficiency'].max())
sm = plt.cm.ScalarMappable(cmap='viridis', norm=norm)  # Use Viridis colormap
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Distance Efficiency')
# Remove any unused subplots (if there are fewer than 9)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Distance Efficiency in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
Freight share¶
# Who pays more for transportation?
# We might find a freight ratio by dividing the freight value by the order value.
# This ratio indicates the percentage of the product price that a person had to pay just to get their order delivered.
# For example, if a product costs R$50.00 and the freight value was R$10.00, then the freight ratio is 0.2 or 20%.
# Higher freight ratios are very likely to discourage customers to complete a purchase.
# Due to logistics costs, we expect to see lower freight ratios in densely populated areas and higher freight ratios in sparsely populated regions.
# Plotting Freight Share based on seller locations is essential for understanding the cost dynamics
# of logistics in various regions:
#
# 1. Cost Management: This visualization reveals how much of the total order cost is attributed to
# freight expenses for different sellers. It allows businesses to identify areas where costs can be
# optimized, leading to improved profitability.
#
# 2. Performance Benchmarking: By comparing freight shares across cities and sellers, organizations can
# benchmark performance. High freight share in certain locations might indicate inefficiencies or higher
# logistical challenges that need to be addressed.
#
# 3. Strategic Decision-Making: Understanding freight share helps in making informed decisions about
# supplier selection, pricing strategies, and resource allocation. It can guide companies on where to
# invest in infrastructure improvements or adjust pricing models to better compete.
#
# 4. Customer Experience: High freight costs can impact overall pricing and delivery times. By analyzing
# freight share, businesses can assess how logistics affect customer satisfaction and make necessary
# adjustments to meet customer expectations effectively.
# @title Freight Share in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np
# City centers used to window the seller map, as (latitude, longitude).
# NOTE(review): relies on `sellers_grouped` built in an earlier cell — the
# per-seller aggregate with mean coordinates, freight_share and num_orders.
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    # 'Fortaleza': (-3.7172, -38.5434), # Commented out for layout space
    # NOTE(review): 'Bahia' is a state; these coords are Salvador's — confirm label.
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}
# Select the sellers located near a given city center (geodesic distance).
def filter_city_data(city_coords, radius_km=50):
    """Return sellers within `radius_km` of a (latitude, longitude) center.

    Distances are geodesic (geopy), computed row by row against the global
    `sellers_grouped` frame.
    """
    center_lat, center_lng = city_coords
    within_radius = sellers_grouped.apply(
        lambda seller: geodesic(
            (seller['s_geolocation_lat'], seller['s_geolocation_lng']),
            (center_lat, center_lng),
        ).km <= radius_km,
        axis=1,
    )
    # .copy() detaches the slice so later assignments cannot trigger
    # pandas' SettingWithCopyWarning.
    return sellers_grouped[within_radius].copy()
# 3x3 grid of subplots — enough slots for the 6 active cities (Fortaleza is
# commented out above); the unused slots are deleted after the loop.
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing
# One freight-share map per city
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)
    # Scatter sellers near the city: color encodes freight share, dot size
    # encodes the seller's number of orders.
    scatter = sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['freight_share'],  # Use freight share for color
        palette='viridis_r',  # Inverted Viridis (higher freight share is less favorable)
        size=city_data['num_orders'],  # Size of points based on number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # Disable individual legend for the scatter plot
    )
    ax.set_title(f"Freight Share in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
# Shared horizontal colorbar, normalized over ALL sellers (not just the
# plotted cities) so the panels share one comparable color scale.
norm = plt.Normalize(vmin=sellers_grouped['freight_share'].min(), vmax=sellers_grouped['freight_share'].max())
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)  # Use inverted Viridis colormap
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Freight Share')
# Remove any unused subplots (if there are fewer than 9)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
# NOTE(review): tight_layout after colorbar(ax=axes) can warn on newer
# matplotlib — confirm the rendered layout is as intended.
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Freight Share in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
Time To Estimate Delivery Days¶
###
# Plotting the "time to estimate delivery days" based on seller location and order size is important for several reasons:
#
# 1. Identifying Regional Delivery Performance: By visualizing delivery times geographically, businesses can easily identify areas
# where sellers may face longer delivery delays, enabling targeted interventions to improve logistics and customer service in those
# regions.
#
# 2. Assessing Impact of Order Volume: The size of the points on the plot indicates the number of orders each seller has filled.
# This allows businesses to analyze whether high-order-volume sellers are able to maintain timely deliveries, highlighting potential
# capacity issues that could impact customer satisfaction.
#
# 3. Strategic Resource Allocation: Understanding the relationship between seller location, delivery times, and order size enables
# businesses to make informed decisions on resource allocation, such as optimizing logistics support or enhancing seller performance
# in specific areas, ultimately driving operational efficiency and customer satisfaction.
###
# @title Time to Estimate Delivery Days in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np
# Per-seller metrics to average: delivery timings, distance/cost ratios,
# and the seller's coordinates (mean of order-level values).
cols = [
    'delivery_time_days', 'order_processing_time_days', 'carrier_delivery_time_days',
    'customer_delivery_time_days', 'time_to_estimate_delivery_days',
    'distance_km', 'distance_efficiency', 'freight_cost_per_km', 'price_to_distance',
    'cost_efficiency', 'order_size_per_km', 'total_order_cost_per_km', 'freight_share',
    's_geolocation_lat', 's_geolocation_lng'
]
# One row per seller with the mean of each metric above
sellers_grouped = orders_cleaned.groupby('seller_id')[cols].mean().reset_index()
# Number of orders each seller has filled (drives dot size in the plots)
order_count = orders_cleaned.groupby('seller_id').size().reset_index(name='num_orders')
sellers_grouped = sellers_grouped.merge(order_count, on='seller_id')
# City centers used to window the seller map, as (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    # 'Fortaleza': (-3.7172, -38.5434), # Commented out for layout space
    # NOTE(review): 'Bahia' is a state; these coords are Salvador's — confirm label.
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}
def filter_city_data(city_coords, radius_km=50):
lat, lng = city_coords
city_data = sellers_grouped[
sellers_grouped.apply(
lambda row: geodesic((row['s_geolocation_lat'], row['s_geolocation_lng']), (lat, lng)).km <= radius_km,
axis=1
)
].copy() # Use .copy() to avoid SettingWithCopyWarning
return city_data
# 3x3 grid of subplots — enough slots for the 6 active cities (Fortaleza is
# commented out above); the unused slots are deleted after the loop.
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing
# One map per city of the sellers' mean time-to-estimated-delivery
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)
    # Scatter sellers near the city: color encodes time to estimated
    # delivery, dot size encodes the seller's number of orders.
    scatter = sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['time_to_estimate_delivery_days'],  # Use time to estimate delivery days for color
        palette='viridis_r',  # Inverted Viridis (lower times are better)
        size=city_data['num_orders'],  # Size of points based on number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # Disable individual legend for the scatter plot
    )
    ax.set_title(f"Time to Estimate Delivery Days in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
# Shared horizontal colorbar, normalized over ALL sellers so panels share
# one comparable color scale.
norm = plt.Normalize(vmin=sellers_grouped['time_to_estimate_delivery_days'].min(), vmax=sellers_grouped['time_to_estimate_delivery_days'].max())
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Time to Estimate Delivery Days')
# Remove any unused subplots (if there are fewer than 9)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
# NOTE(review): tight_layout after colorbar(ax=axes) can warn on newer
# matplotlib — confirm the rendered layout is as intended.
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Time to Estimate Delivery Days in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
Total order cost per km¶
###
# Plotting the total order cost per km based on seller location and order size is important for several reasons:
#
# 1. Identifying High-Cost Areas: By visualizing total order costs geographically, businesses can pinpoint regions where
# shipping costs are disproportionately high. This information is critical for optimizing delivery routes and reducing
# transportation expenses in those specific areas.
#
# 2. Assessing Seller Performance: The size of the points in the plot indicates the number of orders each seller has filled.
# Analyzing how total order cost per km correlates with order volume allows businesses to evaluate whether high-volume
# sellers are maintaining cost efficiency. This can inform decisions about supporting sellers that may be struggling with
# high logistics costs.
#
# 3. Strategic Pricing Adjustments: Understanding the geographic distribution of order costs helps businesses make
# targeted pricing decisions. For instance, sellers operating in high-cost areas may need to adjust their pricing models to
# remain competitive, while also considering promotional strategies to attract customers in those regions.
###
# @title Total Order Cost per km in Major Brazilian Cities (Dot Size by Seller Number of Orders)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from geopy.distance import geodesic
import numpy as np
# Per-seller metrics to average: delivery timings, distance/cost ratios,
# and the seller's coordinates (mean of order-level values).
cols = [
    'delivery_time_days', 'order_processing_time_days', 'carrier_delivery_time_days',
    'customer_delivery_time_days', 'time_to_estimate_delivery_days',
    'distance_km', 'distance_efficiency', 'freight_cost_per_km', 'price_to_distance',
    'cost_efficiency', 'order_size_per_km', 'total_order_cost_per_km', 'freight_share',
    's_geolocation_lat', 's_geolocation_lng'
]
# One row per seller with the mean of each metric above
sellers_grouped = orders_cleaned.groupby('seller_id')[cols].mean().reset_index()
# Number of orders each seller has filled (drives dot size in the plots)
order_count = orders_cleaned.groupby('seller_id').size().reset_index(name='num_orders')
sellers_grouped = sellers_grouped.merge(order_count, on='seller_id')
# City centers used to window the seller map, as (latitude, longitude)
cities_coords = {
    'Rio de Janeiro': (-22.9068, -43.1729),
    'Sao Paulo': (-23.5505, -46.6333),
    'Porto Alegre': (-30.0346, -51.2177),
    'Curitiba': (-25.4284, -49.2733),
    # 'Fortaleza': (-3.7172, -38.5434), # Commented out for layout space
    # NOTE(review): 'Bahia' is a state; these coords are Salvador's — confirm label.
    'Bahia': (-12.9714, -38.5014),
    'Brasilia': (-15.7942, -47.8822)
}
# Select the sellers located near a given city center (geodesic distance).
def filter_city_data(city_coords, radius_km=50):
    """Return sellers within `radius_km` of a (latitude, longitude) center.

    Distances are geodesic (geopy), computed row by row against the global
    `sellers_grouped` frame.
    """
    center_lat, center_lng = city_coords
    within_radius = sellers_grouped.apply(
        lambda seller: geodesic(
            (seller['s_geolocation_lat'], seller['s_geolocation_lng']),
            (center_lat, center_lng),
        ).km <= radius_km,
        axis=1,
    )
    # .copy() detaches the slice so later assignments cannot trigger
    # pandas' SettingWithCopyWarning.
    return sellers_grouped[within_radius].copy()
# 3x3 grid of subplots — enough slots for the 6 active cities (Fortaleza is
# commented out above); the unused slots are deleted after the loop.
fig, axes = plt.subplots(3, 3, figsize=(20, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing
# One map per city of the sellers' mean total order cost per km
for i, (city, coords) in enumerate(cities_coords.items()):
    ax = axes[i]  # Get subplot grid position
    city_data = filter_city_data(coords)
    # Scatter sellers near the city: color encodes total order cost per km,
    # dot size encodes the seller's number of orders.
    scatter = sns.scatterplot(
        x=city_data['s_geolocation_lng'],
        y=city_data['s_geolocation_lat'],
        hue=city_data['total_order_cost_per_km'],  # Use total order cost per km for color
        palette='viridis_r',  # Inverted Viridis if lower cost per km is better
        size=city_data['num_orders'],  # Size of points based on number of orders
        sizes=(20, 200),  # Adjust sizes to improve visibility
        ax=ax,
        legend=False  # Disable individual legend for the scatter plot
    )
    ax.set_title(f"Total Order Cost per km in {city}")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # Remove the top, left, and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
# Shared horizontal colorbar, normalized over ALL sellers so panels share
# one comparable color scale.
norm = plt.Normalize(vmin=sellers_grouped['total_order_cost_per_km'].min(), vmax=sellers_grouped['total_order_cost_per_km'].max())
sm = plt.cm.ScalarMappable(cmap='viridis_r', norm=norm)
sm.set_array([])  # Only needed for older versions of matplotlib
cbar = plt.colorbar(sm, ax=axes, orientation='horizontal', fraction=0.02, pad=0.1)
cbar.set_label('Total Order Cost per km')
# Remove any unused subplots (if there are fewer than 9)
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
# NOTE(review): tight_layout after colorbar(ax=axes) can warn on newer
# matplotlib — confirm the rendered layout is as intended.
plt.tight_layout()
plt.subplots_adjust(top=0.9)  # Adjust top to give space for the main title
plt.suptitle('Total Order Cost per km in Major Brazilian Cities, Dot Size by Seller Number of Orders', fontsize=16, y=1.02)
plt.show()
Largest sellers¶
# Metrics we want to analyse the largest sellers by:
# 1) delivery time days
# 2) late deliveries
# 3) freight cost
# 4) product weight
# 5) total payment value
# 6) cost efficiency
# 7) customer reviews
1) Delivery time (in days)¶
# @title Delivery Performance of 10 Largest Sellers (Dot Size by Customer Payment Value)
# Maps each top seller's customers, colored by delivery time (days) and
# sized by total payment value; the seller's mean location is a black dot.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Get the 10 largest sellers by order count
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()
# 3x4 grid of subplots (12 slots for 10 sellers; spares are deleted below)
fig, axes = plt.subplots(3, 4, figsize=(24, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing
# Global min and max for delivery time in days for consistent color mapping
global_vmin = orders_cleaned['delivery_time_days'].min()
global_vmax = orders_cleaned['delivery_time_days'].max()
# Loop-invariant plot settings, hoisted out of the loop (previously rebuilt
# on every iteration): inverted Viridis (longer delivery is less favorable),
# one shared normalization, and the panel titles by seller rank.
cmap = colormaps['viridis_r']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]
# Loop through the 10 largest sellers and plot
for i, seller_id in enumerate(largest_seller_ids):
    # Orders fulfilled by the current seller
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()
    if not seller_data.empty:
        # Customer locations, colored by delivery time, sized by payment value
        scatter = sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['delivery_time_days'],  # Color metric
            palette=cmap,
            size=seller_data['total_payment_value'],  # Dot-size metric
            sizes=(20, 200),  # Min/max dot size
            ax=axes[i],
            legend=False,  # No per-panel legend; shared colorbar below
            hue_norm=norm  # Shared normalization keeps panels comparable
        )
        # Seller's (mean) location as a black dot
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,  # Size for seller's location
            label='Seller Location'
        )
        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)
        # Remove the top, left, and right spines
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # Hide the empty subplot
        axes[i].set_visible(False)
# Shared colorbar for delivery time (same cmap/norm as the panels)
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Delivery Time (days)', fontsize=12)
cbar.ax.tick_params(labelsize=10)
# Delete any unused subplots (if there are fewer than 12)
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Delivery Performance of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)
# Show the plot
plt.show()
2) Delivery delay (days)¶
# @title Time to Estimate Delivery Days of 10 Largest Sellers (Dot Size by Customer Payment Value)
# Maps each top seller's customers, colored by time to estimated delivery
# and sized by total payment value; seller's mean location is a black dot.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Get the 10 largest sellers by order count
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()
# 3x4 grid of subplots (12 slots for 10 sellers; spares are deleted below)
fig, axes = plt.subplots(3, 4, figsize=(24, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing
# Global min and max for time to estimate delivery for consistent color mapping
global_vmin = orders_cleaned['time_to_estimate_delivery_days'].min()
global_vmax = orders_cleaned['time_to_estimate_delivery_days'].max()
# Loop-invariant plot settings, hoisted out of the loop (previously rebuilt
# on every iteration): inverted Viridis, one shared normalization, and the
# panel titles by seller rank.
cmap = colormaps['viridis_r']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]
# Loop through the 10 largest sellers and plot
for i, seller_id in enumerate(largest_seller_ids):
    # Orders fulfilled by the current seller
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()
    if not seller_data.empty:
        # Customer locations, colored by the delay metric, sized by payment value
        scatter = sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['time_to_estimate_delivery_days'],  # Color metric
            palette=cmap,
            size=seller_data['total_payment_value'],  # Dot-size metric
            sizes=(20, 200),  # Min/max dot size
            ax=axes[i],
            legend=False,  # No per-panel legend; shared colorbar below
            hue_norm=norm  # Shared normalization keeps panels comparable
        )
        # Seller's (mean) location as a black dot
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,  # Size for seller's location
            label='Seller Location'
        )
        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)
        # Remove the top, left, and right spines
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # Hide the empty subplot
        axes[i].set_visible(False)
# Shared colorbar for the delay metric (same cmap/norm as the panels)
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Time to Estimate Delivery Days', fontsize=12)
cbar.ax.tick_params(labelsize=10)
# Delete any unused subplots (if there are fewer than 12)
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Time to Estimate Delivery Days of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)
# Show the plot
plt.show()
Average freight cost for customers¶
# @title Customer Freight Share for 10 Largest Sellers (Dot Size by Customer Payment Value)
# Maps each top seller's customers, colored by freight share (freight value
# over order value) and sized by total payment value.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Get the 10 largest sellers by order count
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()
# 3x4 grid of subplots (12 slots for 10 sellers; spares are deleted below)
fig, axes = plt.subplots(3, 4, figsize=(24, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing
# Global min and max for freight share for consistent color mapping
global_vmin = orders_cleaned['freight_share'].min()
global_vmax = orders_cleaned['freight_share'].max()
# Loop-invariant plot settings, hoisted out of the loop (previously rebuilt
# on every iteration): inverted Viridis (higher freight share is less
# favorable), one shared normalization, and the panel titles by seller rank.
cmap = colormaps['viridis_r']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]
# Loop through the 10 largest sellers and plot
for i, seller_id in enumerate(largest_seller_ids):
    # Orders fulfilled by the current seller
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()
    if not seller_data.empty:
        # Customer locations, colored by freight share, sized by payment value
        scatter = sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['freight_share'],  # Color metric
            palette=cmap,
            size=seller_data['total_payment_value'],  # Dot-size metric
            sizes=(20, 200),  # Min/max dot size
            ax=axes[i],
            legend=False,  # No per-panel legend; shared colorbar below
            hue_norm=norm  # Shared normalization keeps panels comparable
        )
        # Seller's (mean) location as a black dot
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,  # Size for seller's location
            label='Seller Location'
        )
        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)
        # Remove the top, left, and right spines
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # Hide the empty subplot
        axes[i].set_visible(False)
# Shared colorbar for freight share (same cmap/norm as the panels)
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Freight Share', fontsize=12)
cbar.ax.tick_params(labelsize=10)
# Delete any unused subplots (if there are fewer than 12)
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Customer Freight Share for 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)
# Show the plot
plt.show()
Cost efficiency¶
# @title Cost Efficiency of 10 Largest Sellers (Dot Size by Customer Payment Value)
# Maps each top seller's customers, colored by cost efficiency and sized by
# total payment value; the seller's mean location is a black dot.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Get the 10 largest sellers by order count
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()
# 3x4 grid of subplots (12 slots for 10 sellers; spares are deleted below)
fig, axes = plt.subplots(3, 4, figsize=(24, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing
# Global min and max for cost efficiency for consistent color mapping
global_vmin = orders_cleaned['cost_efficiency'].min()
global_vmax = orders_cleaned['cost_efficiency'].max()
# Loop-invariant plot settings, hoisted out of the loop (previously rebuilt
# on every iteration): plain Viridis (higher efficiency is better), one
# shared normalization, and the panel titles by seller rank.
cmap = colormaps['viridis']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]
# Loop through the 10 largest sellers and plot
for i, seller_id in enumerate(largest_seller_ids):
    # Orders fulfilled by the current seller
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()
    if not seller_data.empty:
        # Customer locations, colored by cost efficiency, sized by payment value
        scatter = sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['cost_efficiency'],  # Color metric
            palette=cmap,
            size=seller_data['total_payment_value'],  # Dot-size metric
            sizes=(20, 200),  # Min/max dot size
            ax=axes[i],
            legend=False,  # No per-panel legend; shared colorbar below
            hue_norm=norm  # Shared normalization keeps panels comparable
        )
        # Seller's (mean) location as a black dot
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,  # Size for seller's location
            label='Seller Location'
        )
        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)
        # Remove the top, left, and right spines
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # Hide the empty subplot
        axes[i].set_visible(False)
# Shared colorbar for cost efficiency (same cmap/norm as the panels)
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Cost Efficiency', fontsize=12)
cbar.ax.tick_params(labelsize=10)
# Delete any unused subplots (if there are fewer than 12)
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Cost Efficiency of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)
# Show the plot
plt.show()
Customer reviews¶
# @title Customer Review Score of 10 Largest Sellers (Dot Size by Customer Payment Value)
# Maps each top seller's customers, colored by review score and sized by
# total payment value; the seller's mean location is a black dot.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Get the 10 largest sellers by order count
largest_seller_ids = orders_cleaned['seller_id'].value_counts().nlargest(10).index.tolist()
# 3x4 grid of subplots (12 slots for 10 sellers; spares are deleted below)
fig, axes = plt.subplots(3, 4, figsize=(24, 12))
axes = axes.flatten()  # Flatten the axes array for easier indexing
# Global min and max for review scores for consistent color mapping
global_vmin = orders_cleaned['review_score'].min()
global_vmax = orders_cleaned['review_score'].max()
# Loop-invariant plot settings, hoisted out of the loop (previously rebuilt
# on every iteration): plain Viridis (higher score is better), one shared
# normalization, and the panel titles by seller rank.
cmap = colormaps['viridis']
norm = Normalize(vmin=global_vmin, vmax=global_vmax)
titles = [
    "Largest Seller", "2nd Largest Seller", "3rd Largest Seller",
    "4th Largest Seller", "5th Largest Seller", "6th Largest Seller",
    "7th Largest Seller", "8th Largest Seller", "9th Largest Seller", "10th Largest Seller"
]
# Loop through the 10 largest sellers and plot
for i, seller_id in enumerate(largest_seller_ids):
    # Orders fulfilled by the current seller
    seller_data = orders_cleaned[orders_cleaned['seller_id'] == seller_id].copy()
    if not seller_data.empty:
        # Customer locations, colored by review score, sized by payment value
        scatter = sns.scatterplot(
            x=seller_data['c_geolocation_lng'],
            y=seller_data['c_geolocation_lat'],
            hue=seller_data['review_score'],  # Color metric
            palette=cmap,
            size=seller_data['total_payment_value'],  # Dot-size metric
            sizes=(20, 200),  # Min/max dot size
            ax=axes[i],
            legend=False,  # No per-panel legend; shared colorbar below
            hue_norm=norm  # Shared normalization keeps panels comparable
        )
        # Seller's (mean) location as a black dot
        axes[i].scatter(
            seller_data['s_geolocation_lng'].mean(),
            seller_data['s_geolocation_lat'].mean(),
            color='black',
            s=150,  # Size for seller's location
            label='Seller Location'
        )
        axes[i].set_title(titles[i], fontsize=14, pad=20)
        axes[i].set_xlabel('Longitude', labelpad=20)
        axes[i].set_ylabel('Latitude', labelpad=20)
        # Remove the top, left, and right spines
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    else:
        # Hide the empty subplot
        axes[i].set_visible(False)
# Shared colorbar for review score (same cmap/norm as the panels)
cbar = plt.colorbar(
    plt.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=axes,
    orientation='horizontal',
    fraction=0.05,
    pad=0.12
)
cbar.set_label('Review Score', fontsize=12)
cbar.ax.tick_params(labelsize=10)
# Delete any unused subplots (if there are fewer than 12)
for j in range(len(largest_seller_ids), len(axes)):
    fig.delaxes(axes[j])
# Adjust layout and add a main title
plt.subplots_adjust(
    top=0.85,
    bottom=0.18,
    hspace=0.6,
    wspace=0.35
)
plt.suptitle('Customer Review Score of 10 Largest Sellers, Dot Size by Customer Payment Value', fontsize=16)
# Show the plot
plt.show()
Plotting 10 largest product categories¶
Delivery delay (in days)¶
###
# Visualizing delivery delays (in days) by product category is worthwhile for several reasons:
#
# 1. Understanding Delivery Performance by Category: By plotting delivery delays for different product categories, businesses can
# identify which categories are experiencing longer delivery times. This insight enables companies to focus on improving
# logistics and operational efficiencies for specific categories, ensuring that customers receive their orders in a timely
# manner.
#
# 2. Enhancing Customer Experience: Understanding how delivery delays vary by category can help businesses manage customer
# expectations more effectively. If certain categories consistently have longer delivery times, businesses can proactively
# communicate this information to customers, adjust marketing strategies, or implement changes to improve service levels in
# those categories, ultimately enhancing customer satisfaction.
#
# 3. Informed Inventory and Supplier Management: Analyzing delivery delays by category allows businesses to make data-driven
# decisions regarding inventory management and supplier selection. If specific categories show persistent delays, businesses
# can reevaluate their supplier relationships, explore alternative logistics providers, or adjust inventory levels to mitigate
# these delays and better align supply with customer demand.
###
# @title Delivery Delay (in days) by Product Category (10 Largest Categories)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Map Portuguese product-category names to English display labels.
category_mapping = {
    'cama_mesa_banho': 'Bedding & Bath',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'utilidades_domesticas': 'Household Items',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive',
    'brinquedos': 'Toys',
    'cool_stuff': 'Cool Stuff',
    'ferramentas_jardim': 'Tools & Garden',
    'perfumaria': 'Perfumery',
    'bebes': 'Babies',
    'eletronicos': 'Electronics'
}
# The 10 product categories with the most orders
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# A 4x3 grid gives 12 slots for the 10 category plots (2 left over, deleted below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten so subplots can be indexed with a single integer
# One subplot per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    if not category_data.empty:
        # Color metric: days relative to the estimated delivery date
        metric = category_data['time_to_estimate_delivery_days']
        # Inverted viridis so shorter delays appear in darker shades
        cmap = colormaps['viridis_r']
        # Per-category normalization range for the hue mapping
        vmin = metric.min()
        vmax = metric.max()
        if vmin == vmax:
            vmax = vmin + 1  # Avoid a degenerate (zero-width) normalization range
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Scatter customer locations, colored by delivery delay.
        # NOTE: a `sizes=(20, 200)` argument was removed — seaborn ignores
        # `sizes` when no `size=` variable is given, so it had no effect.
        sns.scatterplot(
            x=category_data['c_geolocation_lng'],
            y=category_data['c_geolocation_lat'],
            hue=metric,              # Delivery delay drives the color
            palette=cmap,            # Inverted viridis colormap
            ax=axes[i],              # Corresponding subplot
            legend=False,            # A colorbar is added below instead
            hue_norm=norm            # Consistent hue normalization
        )
        # Subplot title uses the English label, falling back to the raw name
        axes[i].set_title(f'Category: {category_mapping.get(category, category)}', fontsize=14)
        axes[i].set_xlabel('Longitude')
        axes[i].set_ylabel('Latitude')
        # Drop top/left/right spines for a cleaner look
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar acting as the legend for the hue metric
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery Delay (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Smaller tick labels
        # Integer ticks spaced so roughly 5 intervals span the value range
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])
    else:
        # No data for this category: hide its subplot
        axes[i].set_visible(False)
# Remove the unused subplot slots (12 slots, 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Leave room at the top for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.suptitle('Delivery Delay (in days) by Product Category', fontsize=16)
plt.show()
Look at only late deliveries¶
# @title Delivery Delay (in days) by Product Category (10 Largest Categories with Late Deliveries Only)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Map Portuguese product-category names to English display labels.
category_mapping = {
    'cama_mesa_banho': 'Bedding & Bath',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'utilidades_domesticas': 'Household Items',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive',
    'brinquedos': 'Toys',
    'cool_stuff': 'Cool Stuff',
    'ferramentas_jardim': 'Tools & Garden',
    'perfumaria': 'Perfumery',
    'bebes': 'Babies',
    'eletronicos': 'Electronics'
}
# The 10 product categories with the most orders
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# Keep only late deliveries (positive delay relative to the estimate)
late_deliveries = orders_cleaned[orders_cleaned['time_to_estimate_delivery_days'] > 0]
# A 4x3 grid gives 12 slots for the 10 category plots (2 left over, deleted below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten so subplots can be indexed with a single integer
# One subplot per category
for i, category in enumerate(largest_categories):
    # Late-delivery rows belonging to the current product category
    category_data = late_deliveries[late_deliveries['product_category_name'] == category].copy()
    if not category_data.empty:
        # Color metric: days past the estimated delivery date
        metric = category_data['time_to_estimate_delivery_days']
        # Inverted viridis so shorter delays appear in darker shades
        cmap = colormaps['viridis_r']
        # Per-category normalization range for the hue mapping
        vmin = metric.min()
        vmax = metric.max()
        if vmin == vmax:
            vmax = vmin + 1  # Avoid a degenerate (zero-width) normalization range
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Scatter customer locations, colored by delivery delay.
        # NOTE: a `sizes=(20, 200)` argument was removed — seaborn ignores
        # `sizes` when no `size=` variable is given, so it had no effect.
        sns.scatterplot(
            x=category_data['c_geolocation_lng'],
            y=category_data['c_geolocation_lat'],
            hue=metric,              # Delivery delay drives the color
            palette=cmap,            # Inverted viridis colormap
            ax=axes[i],              # Corresponding subplot
            legend=False,            # A colorbar is added below instead
            hue_norm=norm            # Consistent hue normalization
        )
        # Subplot title uses the English label, falling back to the raw name
        axes[i].set_title(f'Category: {category_mapping.get(category, category)}', fontsize=14)
        axes[i].set_xlabel('Longitude')
        axes[i].set_ylabel('Latitude')
        # Drop top/left/right spines for a cleaner look
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar acting as the legend for the hue metric
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery delay (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Smaller tick labels
        # Integer ticks spaced so roughly 5 intervals span the value range
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])
    else:
        # No data for this category: hide its subplot
        axes[i].set_visible(False)
# Remove the unused subplot slots (12 slots, 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Leave room at the top for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.suptitle('Delivery Delay (in days) by Product Category (Late Deliveries Only)', fontsize=16)
plt.show()
Delivery time (days)¶
# @title Delivery Time (in days) by Product Category (10 Largest Categories)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Map Portuguese product-category names to English display labels.
category_mapping = {
    'cama_mesa_banho': 'Bedding & Bath',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'informatica_acessorios': 'Computers & Accessories',
    'moveis_decoracao': 'Furniture & Decoration',
    'utilidades_domesticas': 'Household Items',
    'relogios_presentes': 'Watches & Gifts',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive',
    'brinquedos': 'Toys',
    'cool_stuff': 'Cool Stuff',
    'ferramentas_jardim': 'Tools & Garden',
    'perfumaria': 'Perfumery',
    'bebes': 'Babies',
    'eletronicos': 'Electronics'
}
# The 10 product categories with the most orders
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# A 4x3 grid gives 12 slots for the 10 category plots (2 left over, deleted below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten so subplots can be indexed with a single integer
# One subplot per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    if not category_data.empty:
        # Color metric: total delivery time in days
        metric = category_data['delivery_time_days']
        # Inverted viridis so shorter delivery times appear in darker shades
        cmap = colormaps['viridis_r']
        # Per-category normalization range for the hue mapping
        vmin = metric.min()
        vmax = metric.max()
        if vmin == vmax:
            vmax = vmin + 1  # Avoid a degenerate (zero-width) normalization range
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Scatter customer locations, colored by delivery time.
        # NOTE: a `sizes=(20, 200)` argument was removed — seaborn ignores
        # `sizes` when no `size=` variable is given, so it had no effect.
        sns.scatterplot(
            x=category_data['c_geolocation_lng'],
            y=category_data['c_geolocation_lat'],
            hue=metric,              # Delivery time drives the color
            palette=cmap,            # Inverted viridis colormap
            ax=axes[i],              # Corresponding subplot
            legend=False,            # A colorbar is added below instead
            hue_norm=norm            # Consistent hue normalization
        )
        # Subplot title uses the English label, falling back to the raw name
        axes[i].set_title(f'Category: {category_mapping.get(category, category)}', fontsize=14)
        axes[i].set_xlabel('Longitude')
        axes[i].set_ylabel('Latitude')
        # Drop top/left/right spines for a cleaner look
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar acting as the legend for the hue metric
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery time (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Smaller tick labels
        # Integer ticks spaced so roughly 5 intervals span the value range
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])
    else:
        # No data for this category: hide its subplot
        axes[i].set_visible(False)
# Remove the unused subplot slots (12 slots, 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Leave room at the top for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.suptitle('Delivery Time (in days) by Product Category', fontsize=16)
plt.show()
Sellers by Product Categories¶
Delivery time in days¶
# @title Delivery Time (days) by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Map Portuguese product-category names to English display labels.
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# The 10 product categories with the most orders
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# A 4x3 grid gives 12 slots for the 10 category plots (2 left over, deleted below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten so subplots can be indexed with a single integer
# One subplot per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    # One row per seller: mean location, mean delivery time, and order count
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',   # Mean longitude of seller
        's_geolocation_lat': 'mean',   # Mean latitude of seller
        'delivery_time_days': 'mean',  # Mean delivery time
        'order_id': 'count'            # Orders fulfilled by the seller
    }).reset_index()
    if not seller_data.empty:
        # Inverted viridis so shorter delivery times appear in darker shades
        cmap = colormaps['viridis_r']
        # Per-category normalization range for the hue mapping
        vmin = seller_data['delivery_time_days'].min()
        vmax = seller_data['delivery_time_days'].max()
        if vmin == vmax:
            vmax = vmin + 1  # Avoid a degenerate (zero-width) normalization range
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Scatter seller locations: color = mean delivery time, size = order count
        sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=seller_data['delivery_time_days'],  # Delivery time drives the color
            size=seller_data['order_id'],           # Order count drives the dot size
            sizes=(50, 400),                        # Min/max dot size
            ax=axes[i],                             # Corresponding subplot
            legend=False,                           # A colorbar is added below instead
            hue_norm=norm,                          # Consistent hue normalization
            palette=cmap                            # Inverted viridis colormap
        )
        # Subplot title uses the English label, falling back to the raw name
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')
        # Drop top/left/right spines for a cleaner look
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar acting as the legend for the hue metric
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery Time (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Smaller tick labels
        # Integer ticks spaced so roughly 5 intervals span the value range
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])
    else:
        # No data for this category: hide its subplot
        axes[i].set_visible(False)
# Remove the unused subplot slots (12 slots, 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Leave room at the top for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.suptitle('Delivery Time (days) by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)
plt.show()
Delivery to Estimate (in days)¶
# @title Delivery to Estimate (days) by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Map Portuguese product-category names to English display labels.
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# The 10 product categories with the most orders
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# A 4x3 grid gives 12 slots for the 10 category plots (2 left over, deleted below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten so subplots can be indexed with a single integer
# One subplot per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    # One row per seller: mean location, mean delivery-vs-estimate gap, and order count
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',               # Mean longitude of seller
        's_geolocation_lat': 'mean',               # Mean latitude of seller
        'time_to_estimate_delivery_days': 'mean',  # Mean days relative to the estimate
        'order_id': 'count'                        # Orders fulfilled by the seller
    }).reset_index()
    if not seller_data.empty:
        # Inverted viridis so smaller gaps appear in darker shades
        cmap = colormaps['viridis_r']
        # Per-category normalization range for the hue mapping
        vmin = seller_data['time_to_estimate_delivery_days'].min()
        vmax = seller_data['time_to_estimate_delivery_days'].max()
        if vmin == vmax:
            vmax = vmin + 1  # Avoid a degenerate (zero-width) normalization range
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Scatter seller locations: color = mean gap to estimate, size = order count
        sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=seller_data['time_to_estimate_delivery_days'],  # Gap drives the color
            size=seller_data['order_id'],                       # Order count drives the dot size
            sizes=(50, 400),                                    # Min/max dot size
            ax=axes[i],                                         # Corresponding subplot
            legend=False,                                       # A colorbar is added below instead
            hue_norm=norm,                                      # Consistent hue normalization
            palette=cmap                                        # Inverted viridis colormap
        )
        # Subplot title uses the English label, falling back to the raw name
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')
        # Drop top/left/right spines for a cleaner look
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar acting as the legend for the hue metric
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Delivery to Estimate (days)', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Smaller tick labels
        # Integer ticks spaced so roughly 5 intervals span the value range
        tick_positions = np.arange(int(vmin), int(vmax) + 1, max(1, (vmax - vmin) // 5))
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{int(tick)}' for tick in tick_positions])
    else:
        # No data for this category: hide its subplot
        axes[i].set_visible(False)
# Remove the unused subplot slots (12 slots, 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Leave room at the top for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.suptitle('Delivery to Estimate (days) by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)
plt.show()
Cost Efficiency (freight value over delivery time in days)¶
# @title Cost Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Map Portuguese product-category names to English display labels.
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# The 10 product categories with the most orders
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# A 4x3 grid gives 12 slots for the 10 category plots (2 left over, deleted below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten so subplots can be indexed with a single integer
# One subplot per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    # One row per seller: mean location, mean cost efficiency, and order count
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',  # Mean longitude of seller
        's_geolocation_lat': 'mean',  # Mean latitude of seller
        'cost_efficiency': 'mean',    # Mean cost efficiency (freight value / delivery days)
        'order_id': 'count'           # Orders fulfilled by the seller
    }).reset_index()
    if not seller_data.empty:
        # Color metric: mean cost efficiency per seller
        metric = seller_data['cost_efficiency']
        cmap = colormaps['viridis']
        # Per-category normalization range for the hue mapping
        vmin, vmax = metric.min(), metric.max()
        if vmin == vmax:
            vmax = vmin + 1  # Avoid a degenerate (zero-width) normalization range
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Scatter seller locations: color = cost efficiency, size = order count
        sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=metric,                    # Cost efficiency drives the color
            size=seller_data['order_id'],  # Order count drives the dot size
            sizes=(50, 400),               # Min/max dot size
            ax=axes[i],                    # Corresponding subplot
            legend=False,                  # A colorbar is added below instead
            hue_norm=norm,                 # Consistent hue normalization
            palette=cmap                   # Viridis colormap
        )
        # Subplot title uses the English label, falling back to the raw name
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')
        # Drop top/left/right spines for a cleaner look
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar acting as the legend for the hue metric
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Cost Efficiency', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Smaller tick labels
        # 5 evenly spaced float ticks. FIX: the previous dtype=int linspace
        # truncated this fractional metric and produced duplicate ticks
        # whenever the value range was smaller than ~4.
        tick_positions = np.linspace(vmin, vmax, num=5)
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{tick:.1f}' for tick in tick_positions])
    else:
        # No data for this category: hide its subplot
        axes[i].set_visible(False)
# Remove the unused subplot slots (12 slots, 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Leave room at the top for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.suptitle('Cost Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)
plt.show()
Distance Efficiency (distance in km over delivery time days)¶
# @title Distance Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Map Portuguese product-category names to English display labels.
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# The 10 product categories with the most orders
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# A 4x3 grid gives 12 slots for the 10 category plots (2 left over, deleted below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten so subplots can be indexed with a single integer
# One subplot per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    # One row per seller: mean location, mean distance efficiency, and order count
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',    # Mean longitude of seller
        's_geolocation_lat': 'mean',    # Mean latitude of seller
        'distance_efficiency': 'mean',  # Mean distance efficiency (km / delivery days)
        'order_id': 'count'             # Orders fulfilled by the seller
    }).reset_index()
    if not seller_data.empty:
        # Color metric: mean distance efficiency per seller
        metric = seller_data['distance_efficiency']
        cmap = colormaps['viridis']
        # Per-category normalization range for the hue mapping
        vmin, vmax = metric.min(), metric.max()
        if vmin == vmax:
            vmax = vmin + 1  # Avoid a degenerate (zero-width) normalization range
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Scatter seller locations: color = distance efficiency, size = order count
        sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=metric,                    # Distance efficiency drives the color
            size=seller_data['order_id'],  # Order count drives the dot size
            sizes=(50, 400),               # Min/max dot size
            ax=axes[i],                    # Corresponding subplot
            legend=False,                  # A colorbar is added below instead
            hue_norm=norm,                 # Consistent hue normalization
            palette=cmap                   # Viridis colormap
        )
        # Subplot title uses the English label, falling back to the raw name
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')
        # Drop top/left/right spines, matching the sibling category plots
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['left'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
        # Per-subplot colorbar acting as the legend for the hue metric
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Distance Efficiency', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Smaller tick labels
        # 5 evenly spaced float ticks. FIX: the previous dtype=int linspace
        # truncated this fractional metric and produced duplicate ticks
        # whenever the value range was smaller than ~4.
        tick_positions = np.linspace(vmin, vmax, num=5)
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{tick:.1f}' for tick in tick_positions])
    else:
        # No data for this category: hide its subplot
        axes[i].set_visible(False)
# Remove the unused subplot slots (12 slots, 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Leave room at the top for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.suptitle('Distance Efficiency by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)
plt.show()
Freight Share¶
# @title Freight Share by Product Category (Seller Locations, Dot Size by Orders Filled)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Map Portuguese product-category names to English display labels.
category_translation = {
    'cama_mesa_banho': 'Bedding, Bath & Table',
    'beleza_saude': 'Beauty & Health',
    'esporte_lazer': 'Sports & Leisure',
    'utilidades_domesticas': 'Household Items',
    'moveis_decoracao': 'Furniture & Decoration',
    'informatica_acessorios': 'Computers & Accessories',
    'relogios_presentes': 'Watches & Gifts',
    'brinquedos': 'Toys',
    'telefonia': 'Telephony',
    'automotivo': 'Automotive'
}
# The 10 product categories with the most orders
largest_categories = orders_cleaned['product_category_name'].value_counts().nlargest(10).index.tolist()
# A 4x3 grid gives 12 slots for the 10 category plots (2 left over, deleted below)
fig, axes = plt.subplots(4, 3, figsize=(20, 16))
axes = axes.flatten()  # Flatten so subplots can be indexed with a single integer
# Base colormap name; append "_r" to invert when lower values are better
colormap = 'viridis'
invert_cmap = True  # Lower freight share is better, so invert the colormap
# FIX: plt.cm.get_cmap was deprecated in matplotlib 3.7 and removed in 3.9;
# use the matplotlib.colormaps registry, as the sibling cells already do.
cmap = colormaps[colormap + ("_r" if invert_cmap else "")]
# One subplot per category
for i, category in enumerate(largest_categories):
    # Rows belonging to the current product category
    category_data = orders_cleaned[orders_cleaned['product_category_name'] == category].copy()
    # One row per seller: mean location, mean freight share, and order count
    seller_data = category_data.groupby('seller_id').agg({
        's_geolocation_lng': 'mean',  # Mean longitude of seller
        's_geolocation_lat': 'mean',  # Mean latitude of seller
        'freight_share': 'mean',      # Mean freight share of the order value
        'order_id': 'count'           # Orders fulfilled by the seller
    }).reset_index()
    if not seller_data.empty:
        # Color metric: mean freight share per seller
        metric = seller_data['freight_share']
        # Freight share is a ratio, so normalize over the fixed [0, 1] range
        # (this also keeps colors comparable across all 10 subplots)
        vmin = 0
        vmax = 1
        norm = Normalize(vmin=vmin, vmax=vmax)
        # Scatter seller locations: color = freight share, size = order count
        sns.scatterplot(
            x=seller_data['s_geolocation_lng'],
            y=seller_data['s_geolocation_lat'],
            hue=metric,                    # Freight share drives the color
            size=seller_data['order_id'],  # Order count drives the dot size
            sizes=(50, 400),               # Min/max dot size
            ax=axes[i],                    # Corresponding subplot
            legend=False,                  # A colorbar is added below instead
            hue_norm=norm,                 # Fixed [0, 1] hue normalization
            palette=cmap                   # Inverted viridis colormap
        )
        # Subplot title uses the English label, falling back to the raw name
        translated_category = category_translation.get(category, category)
        axes[i].set_title(f'Category: {translated_category}', fontsize=14)
        axes[i].set_xlabel('Seller Longitude')
        axes[i].set_ylabel('Seller Latitude')
        # Per-subplot colorbar acting as the legend for the hue metric
        cbar = plt.colorbar(
            plt.cm.ScalarMappable(norm=norm, cmap=cmap),
            ax=axes[i],
            orientation='vertical'
        )
        cbar.set_label('Freight Share', fontsize=12)
        cbar.ax.tick_params(labelsize=10)  # Smaller tick labels
        # 6 evenly spaced ticks across the fixed [0, 1] range
        tick_positions = np.linspace(vmin, vmax, 6)
        cbar.set_ticks(tick_positions)
        cbar.ax.set_yticklabels([f'{tick:.2f}' for tick in tick_positions])
    else:
        # No data for this category: hide its subplot
        axes[i].set_visible(False)
# Remove the unused subplot slots (12 slots, 10 categories)
for j in range(len(largest_categories), len(axes)):
    fig.delaxes(axes[j])
# Leave room at the top for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.suptitle('Freight Share by Product Category (Seller Locations, Dot Size by Orders Filled)', fontsize=16)
plt.show()